├── Scrapy ├── tutorial │ ├── tutorial │ │ ├── __init__.py │ │ ├── spiders │ │ │ └── __init__.py │ │ ├── pipelines.py.tmpl │ │ ├── items.py.tmpl │ │ └── settings.py.tmpl │ └── scrapy.cfg ├── 00-Scrapy安装.txt ├── 02-Scrapy创建项目.txt └── 01-Scrapy安装失败解决方案.txt ├── __pycache__ └── ChromeCookies.cpython-34.pyc ├── 17-Phantomjs.py ├── .gitattributes ├── 13-CookieDeciphering.py ├── .gitignore ├── 08-IdentifyingCode.py ├── 09-downPicture.py ├── 14-ChromePassword.py ├── 24-FilesDownload.py ├── README.md ├── 01-URL.py ├── 07-BaiduLenovo.py ├── ChromeCookies.py ├── 23-C315Check.py ├── 16-selenium.py ├── 18-WeiboAnalbum.py ├── 12-ChromeCookie1.py ├── 12-ChromeCookie2.py ├── 10-zhihuLogin.py ├── 15-ZhihuAnswerList.py ├── 02-BFS.py ├── 05-tieba.py ├── 06-JDprice.py ├── 21-DoubanMovieTypeTop.py ├── 04-Login.py ├── 20-DoubanMovieTop250.py ├── 22-PyQuery.py ├── 11-CSDNBlogList.py ├── 03-Chrome.py └── 19-BeautifulSoup.py /Scrapy/tutorial/tutorial/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /Scrapy/00-Scrapy安装.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jueee/PythonWebCrawlers/HEAD/Scrapy/00-Scrapy安装.txt -------------------------------------------------------------------------------- /Scrapy/02-Scrapy创建项目.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jueee/PythonWebCrawlers/HEAD/Scrapy/02-Scrapy创建项目.txt -------------------------------------------------------------------------------- /Scrapy/01-Scrapy安装失败解决方案.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jueee/PythonWebCrawlers/HEAD/Scrapy/01-Scrapy安装失败解决方案.txt -------------------------------------------------------------------------------- /__pycache__/ChromeCookies.cpython-34.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jueee/PythonWebCrawlers/HEAD/__pycache__/ChromeCookies.cpython-34.pyc -------------------------------------------------------------------------------- /Scrapy/tutorial/tutorial/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 
5 | -------------------------------------------------------------------------------- /17-Phantomjs.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | ''' 4 | 动态爬虫工具 Phantomjs 的安装与使用(通过JS渲染实现)。 5 | ''' 6 | ''' 7 | Phantomjs 安装:到PhantomJS的官方网站上下载,然后放到python的安装目录。 8 | 官网地址:http://phantomjs.org/download.html 9 | ''' 10 | from tornado_fetcher import Fetcher -------------------------------------------------------------------------------- /Scrapy/tutorial/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = ${project_name}.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = ${project_name} 12 | -------------------------------------------------------------------------------- /Scrapy/tutorial/tutorial/pipelines.py.tmpl: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | class ${ProjectName}Pipeline(object): 10 | def process_item(self, item, spider): 11 | return item 12 | -------------------------------------------------------------------------------- /Scrapy/tutorial/tutorial/items.py.tmpl: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class ${ProjectName}Item(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | pass 15 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | 4 | # Custom for Visual Studio 5 | *.cs diff=csharp 6 | 7 | # Standard to msysgit 8 | *.doc diff=astextplain 9 | *.DOC diff=astextplain 10 | *.docx diff=astextplain 11 | *.DOCX diff=astextplain 12 | *.dot diff=astextplain 13 | *.DOT diff=astextplain 14 | *.pdf diff=astextplain 15 | *.PDF diff=astextplain 16 | *.rtf diff=astextplain 17 | *.RTF diff=astextplain 18 | -------------------------------------------------------------------------------- /13-CookieDeciphering.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Chrome 33+浏览器 Cookies encrypted_value解密脚本 3 | ''' 4 | ''' 5 | Chrome浏览器版本33以上对Cookies进行了加密. 
6 | 7 | 用SQLite Developer打开Chrome的Cookies文件就会发现,原来的value字段已经为空,取而代之的是加密的encrypted_value。 8 | ''' 9 | import sqlite3 10 | import win32crypt 11 | import os 12 | 13 | cookie_file_path = os.path.join(os.environ['LOCALAPPDATA'],r'Google\Chrome\User Data\Default\Cookies') 14 | print('Cookies文件的地址为:%s' % cookie_file_path) 15 | if not os.path.exists(cookie_file_path): 16 | raise Exception('Cookies 文件不存在...') 17 | sql_exe="select host_key,name,value,path,encrypted_value from cookies"; 18 | conn = sqlite3.connect(cookie_file_path) 19 | for row in conn.execute(sql_exe): 20 | ret = win32crypt.CryptUnprotectData(row[4], None, None, None, 0) 21 | print('Cookie的Key:%-40s,Cookie名:%-50s,Cookie值:%s' % (row[0],row[1],ret[1].decode())) 22 | conn.close() 23 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /result/* 2 | 3 | # Windows image file caches 4 | Thumbs.db 5 | ehthumbs.db 6 | 7 | # Folder config file 8 | Desktop.ini 9 | 10 | # Recycle Bin used on file shares 11 | $RECYCLE.BIN/ 12 | 13 | # Windows Installer files 14 | *.cab 15 | *.msi 16 | *.msm 17 | *.msp 18 | 19 | # Windows shortcuts 20 | *.lnk 21 | 22 | # ========================= 23 | # Operating System Files 24 | # ========================= 25 | 26 | # OSX 27 | # ========================= 28 | 29 | .DS_Store 30 | .AppleDouble 31 | .LSOverride 32 | 33 | # Thumbnails 34 | ._* 35 | 36 | # Files that might appear in the root of a volume 37 | .DocumentRevisions-V100 38 | .fseventsd 39 | .Spotlight-V100 40 | .TemporaryItems 41 | .Trashes 42 | .VolumeIcon.icns 43 | 44 | # Directories potentially created on remote AFP share 45 | .AppleDB 46 | .AppleDesktop 47 | Network Trash Folder 48 | Temporary Items 49 | .apdisk 50 | -------------------------------------------------------------------------------- /08-IdentifyingCode.py: -------------------------------------------------------------------------------- 1 | from PIL import Image 2 | 3 | image_name = r'result\08-IdentifyingCode\test1.jpg' 4 | 5 | sx = 20 6 | sy = 16 7 | ex = 8 8 | ey = 10 9 | st = 20 10 | 11 | def gc(a): 12 | if a>180: 13 | return 0 14 | else: 15 | return 1 16 | 17 | def disp(im): 18 | sizex, sizey = im.size 19 | tz = [] 20 | for y in range(sizey): 21 | t = [] 22 | for x in range(sizex): 23 | t.append(gc(im.getpixel((x,y)))) 24 | tz.append(t) 25 | for i in tz: 26 | print('') 27 | for l in i: 28 | print(l, sep='', end='') 29 | return tz 30 | 31 | im = Image.open(image_name) 32 | im = im.convert('L') 33 | 34 | im_new = [] 35 | for i in range(5): 36 | im1 = im.crop((sx+(i*st),sy,sx+ex+(i+st),sy+ey)) 37 | im_new.append(im1) 38 | 39 | for i in im_new: 40 | disp(i) 41 | print('') 42 | 43 | #input('') -------------------------------------------------------------------------------- /09-downPicture.py: -------------------------------------------------------------------------------- 1 | ''' 2 | 爬取某个网页上的所有图片资源 3 | 4 | ''' 5 | import urllib.request 6 | import socket 7 | import re 8 | import sys 9 | import os 10 | 11 | targetDir = r'result\09-downPicture' #文件保存路径 12 | 13 | def destFile(path): 14 | if not os.path.isdir(targetDir): 15 | os.mkdir(targetDir) 16 | pos = path.rindex('/') 17 | t = os.path.join(targetDir, path[pos+1:]) 18 | return t 19 | 20 | def downPicture(weburl): 21 | webheaders = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'} 22 | req = urllib.request.Request(url=weburl, headers=webheaders) #构造请求报头 23 | webpage = 
urllib.request.urlopen(req) #发送请求报头 24 | contentBytes = webpage.read() 25 | for link,t in set(re.findall(r'(http:[^\s]*?(jpg|png|gif))', str(contentBytes))): #正则表达式查找所有的图片 26 | print(link) 27 | try: 28 | urllib.request.urlretrieve(link, destFile(link)) #下载图片 29 | except : 30 | print('失败') 31 | 32 | if __name__ == '__main__': 33 | weburl = 'http://www.douban.com/' 34 | downPicture(weburl) -------------------------------------------------------------------------------- /14-ChromePassword.py: -------------------------------------------------------------------------------- 1 | ''' 2 | 获取Chrome浏览器已保存的账号和密码。 3 | ''' 4 | # Chrome浏览器已保存的密码都保存在一个sqlite3数据库文件中,和Cookies数据库在同一个文件夹. 5 | # C:\Users\Jueee\AppData\Local\Google\Chrome\User Data\Default\Login Data 6 | ''' 7 | 使用CryptUnprotectData函数解密数据库中的密码字段,即可还原密码,只需要User权限,并且只能是User权限。 8 | ''' 9 | ''' 10 | 为了防止出现读写出错,建议先把数据库临时拷贝到当前目录。 11 | ''' 12 | import os,sys 13 | import shutil 14 | import sqlite3 15 | import win32crypt 16 | 17 | db_file_path = os.path.join(os.environ['LOCALAPPDATA'],r'Google\Chrome\User Data\Default\Login Data') 18 | print(db_file_path) 19 | 20 | tmp_file = os.path.join(os.path.dirname(sys.executable),'tmp_tmp_tmp') 21 | print(tmp_file) 22 | if os.path.exists(tmp_file): 23 | os.remove(tmp_file) 24 | shutil.copyfile(db_file_path,tmp_file) 25 | 26 | conn = sqlite3.connect(tmp_file) 27 | for row in conn.execute('select signon_realm,username_value,password_value from logins'): 28 | try: 29 | ret = win32crypt.CryptUnprotectData(row[2],None,None,None,0) 30 | print('网站:%-50s,用户名:%-20s,密码:%s' % (row[0][:50],row[1],ret[1].decode('gbk'))) 31 | except Exception as e: 32 | print('获取Chrome密码失败...') 33 | raise e 34 | conn.close() 35 | os.remove(tmp_file) 36 | -------------------------------------------------------------------------------- /24-FilesDownload.py: -------------------------------------------------------------------------------- 1 | ''' 2 | 爬取文件集。 3 | ''' 4 | import requests 5 | import re,time,os 6 | 7 | USER_NAMBER = 'yunzhan365' # 子路径 8 | targetDir = 'result\\24-FilesDownload.py\\'+USER_NAMBER #文件保存路径 9 | 10 | # 获取保存路径 11 | def destFile(path,name=''): 12 | if not os.path.isdir(targetDir): 13 | os.makedirs(targetDir) 14 | pos = path.rindex('/') 15 | pom = path.rindex('.') 16 | if name=='': 17 | t = os.path.join(targetDir, path[pos+1:]) 18 | else: 19 | t = os.path.join(targetDir, name + '.' 
+ path[pom+1:]) 20 | return t 21 | 22 | # 保存图片 23 | def saveImage(imgUrl,name=''): 24 | response = requests.get(imgUrl, stream=True) 25 | image = response.content 26 | imgPath = destFile(imgUrl,name) 27 | try: 28 | with open(imgPath ,"wb") as jpg: 29 | jpg.write(image) 30 | print('保存图片成功!%s' % imgPath) 31 | return 32 | except IOError: 33 | print('保存图片成功!%s' % imgUrl) 34 | return 35 | finally: 36 | jpg.close 37 | 38 | if __name__=='__main__': 39 | for n in range(1,99): 40 | album_url = 'https://book.yunzhan365.com/pcqz/stgm/files/mobile/'+str(n)+'.jpg' 41 | saveImage(album_url, str(n).zfill(4)) 42 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 05-WebCrawlers 2 | 网络爬虫(Web Crawlers)学习笔记。 3 | 4 | ---------- 5 | 6 | ### 内容说明: 7 | #### 1、Scrapy文件夹: 8 | web抓取框架Scrapy学习笔记。 9 | 10 | #### 2、其他: 11 | + 01-URL.py:用Python抓取指定URL页面。 12 | + 02-BFS.py:使用队列来实现爬虫的广度优先搜索(BFS)算法。 13 | + 03-Chrome.py:伪装浏览器来访问网站。 14 | + 04-Login.py:模拟用户登录(以登录 CSDN 网站为例)。 15 | + 05-tieba.py:爬取百度贴吧的HTML网页到本地。 16 | + 06-JDprice.py:爬虫获取京东的商品价格,并把爬取结果保存至Excel。 17 | + 07-BaiduLenovo.py:百度搜索框联想词的获取。 18 | + 08-IdentifyingCode.py:读取验证码图片。 19 | + 09-downPicture.py:爬取某个网页上的所有图片资源。 20 | + 10-zhihuLogin.py:知乎网的登录。 21 | + 11-CSDNBlogList.py:根据用户名,获取该用户的CSDN的博客列表。 22 | + 12-ChromeCookie.py:在Python中使用Chrome浏览器已有的Cookies发起HTTP请求。 23 | + 13-CookieDeciphering.py:Chrome 33+浏览器 Cookies encrypted_value 解密。 24 | + 14-ChromePassword.py:获取Chrome浏览器已保存的账号和密码。 25 | + 15-ZhihuAnswerList.py:获取某个用户的知乎回答列表及赞同数(静态网页爬虫)。 26 | + 16-selenium.py:动态爬虫工具 selenium 的安装与使用(通过控制浏览器实现)。 27 | + 17-Phantomjs.py:动态爬虫工具 Phantomjs 的安装与使用(通过JS渲染实现)。 28 | + 18-WeiboAnalbum.py:爬取新浪微博某个用户的头像相册(通过分析API JSON)。 29 | + 19-BeautifulSoup.py:Beautiful Soup 学习笔记(python3中的爬虫匹配神器)。 30 | + 20-DoubanMovieTop250.py:爬取豆瓣评分最高的250部电影(使用Beautiful Soup)。 31 | + 21-DoubanMovieTypeTop.py:按类别爬取豆瓣评分最高的电影(使用Beautiful Soup)。 32 | + 22-PyQuery.py:Python中PyQuery库的使用总结。 33 | + 23-C315Check.py:根据物流防伪码,查询所购商品是否正品。 34 | -------------------------------------------------------------------------------- /01-URL.py: -------------------------------------------------------------------------------- 1 | ''' 2 | 用Python抓取指定页面 3 | ''' 4 | #encoding:UTF-8 5 | import urllib.request 6 | 7 | 8 | ''' 9 | urllib.request.urlopen(url, data=None, [timeout, ]*, cafile=None, capath=None, cadefault=False) 10 | 11 | urlopen()函数返回一个 http.client.HTTPResponse 对象 12 | ''' 13 | # 用Python抓取指定页面的源码 14 | if __name__ != '__main__': 15 | url = 'http://www.baidu.com' 16 | data = urllib.request.urlopen(url) 17 | print(data) 18 | print(data.info()) 19 | print(type(data)) 20 | print(data.geturl()) 21 | print(data.getcode()) 22 | print(data.read()) 23 | 24 | # 获取页码状态和源码 25 | if __name__ != '__main__': 26 | url = 'http://www.douban.com/' 27 | req = urllib.request.Request(url) 28 | req.add_header('User-Agent', 'Mozilla/6.0 (iPhone; CPU iPhone OS 8_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/8.0 Mobile/10A5376e Safari/8536.25') 29 | 30 | with urllib.request.urlopen(req) as f: 31 | print('status:',f.status,f.reason) 32 | for k,v in f.getheaders(): 33 | print('%s:%s' % (k,v)) 34 | print(f.read().decode('utf-8')) 35 | 36 | # 用Python简单处理URL 37 | ''' 38 | data是一个字典, 然后通过urllib.parse.urlencode()来将data转换为 ‘word=Jecvay+Notes’的字符串, 最后和url合并为full_url, 39 | ''' 40 | if __name__=='__main__': 41 | data = {} 42 | data['wd'] = 'ju eee' 43 | url_values = urllib.parse.urlencode(data) 44 | url = 
'http://www.baidu.com/s' 45 | full_url = url + url_values 46 | print(full_url) 47 | 48 | data = urllib.request.urlopen(full_url).read() 49 | print(data) 50 | -------------------------------------------------------------------------------- /07-BaiduLenovo.py: -------------------------------------------------------------------------------- 1 | ''' 2 | 百度搜索框理想词的获取 3 | ''' 4 | 5 | import urllib.request 6 | import re 7 | 8 | def get_baidu_lenovo(codeStr): 9 | pass 10 | # urllib的quote()方法控制对特殊字符的URL编码 11 | # 如将"百度"编码为"%E7%99%BE%E5%BA%A6" 12 | gjc = urllib.request.quote(codeStr) 13 | url = 'http://suggestion.baidu.com/su?wd=' + gjc 14 | headers = { 15 | 'User-Agent':'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6', 16 | 'Accept' : '*/*', 17 | 'Connection' : 'Keep-Alive' 18 | } 19 | 20 | req = urllib.request.Request(url, headers=headers) 21 | html = urllib.request.urlopen(req).read().decode('gbk') 22 | lenovoStr = re.search(r's:\[(.*?)\]', html).group(1) 23 | print('“%s”的理想词为:%s' % (codeStr, lenovoStr)) 24 | 25 | codelist = ['百度','谷歌','GitHub','老罗','韩寒','%'] 26 | for i in codelist: 27 | get_baidu_lenovo(i) 28 | ''' 29 | 运行结果为: 30 | 31 | “百度”的理想词为:"百度云","百度翻译","百度地图","百度杀毒","百度卫士","百度音乐","百度网盘","百度文库","百度糯米","百度外卖" 32 | “谷歌”的理想词为:"谷歌翻译","谷歌地图","谷歌浏览器","谷歌地球","谷歌学术","谷歌僵尸地图","谷歌浏览器官方下载","谷歌地图高清卫星地图","谷歌邮箱","谷歌搜索" 33 | “GitHub”的理想词为:"github for windows","github 教程","github desktop","github 下载","github中文网","github使用教程","github for mac","github desktop 教程","github是什么","github删除repository" 34 | “老罗”的理想词为:"老罗语录","老罗的android之旅","老罗英语培训","老罗android视频教程","老罗三句名言","老罗斯福","老罗android开发视频教程","老罗英语培训网站","老罗微博","老罗android" 35 | “韩寒”的理想词为:"韩寒女儿","韩寒后会无期","韩寒 对话","韩寒吧","韩寒现象","韩寒经典语录","韩寒餐厅被罚","韩寒 白龙马","韩寒电影","韩寒博客" 36 | “%”的理想词为:"%s","%d","%2c","%g","%a","%x","%windir%","%f","%20","%u" 37 | ''' 38 | -------------------------------------------------------------------------------- /ChromeCookies.py: -------------------------------------------------------------------------------- 1 | ''' 2 | 在Python中使用Chrome浏览器已有的Cookies发起HTTP请求。 3 | 4 | 参考博客:http://blog.csdn.net/pipisorry/article/details/47980653 5 | ''' 6 | import subprocess 7 | import sqlite3 8 | import win32crypt 9 | import re,os 10 | import requests 11 | 12 | def get_chrome_cookies(url): 13 | DIST_COOKIE_FILENAME = '.\python-chrome-cookies' 14 | SOUR_COOKIE_FILENAME = os.path.join(os.environ['LOCALAPPDATA'],r'Google\Chrome\User Data\Default\Cookies') 15 | print(SOUR_COOKIE_FILENAME) 16 | if not os.path.exists(SOUR_COOKIE_FILENAME): 17 | raise Exception('Cookies 文件不存在...') 18 | subprocess.call(['copy', SOUR_COOKIE_FILENAME, DIST_COOKIE_FILENAME], shell=True) 19 | conn = sqlite3.connect(".\python-chrome-cookies") 20 | ret_dict = {} 21 | for row in conn.execute("SELECT host_key, name, path, value, encrypted_value FROM cookies"): 22 | if __name__=='__main__': 23 | print(row[0],row[1]) 24 | if row[0] != url: 25 | continue 26 | ret = win32crypt.CryptUnprotectData(row[4], None, None, None, 0) 27 | ret_dict[row[1]] = ret[1].decode() 28 | conn.close() 29 | subprocess.call(['del', '.\python-chrome-cookies'], shell=True) 30 | return ret_dict 31 | 32 | # 使用方法参考 33 | if __name__=='__main__': 34 | print('------使用requests进行解析访问------') 35 | DOMAIN_NAME = '.zhihu.com' 36 | get_url = r'https://www.zhihu.com/people/jueee/answers' 37 | response = requests.get(get_url, cookies=get_chrome_cookies(DOMAIN_NAME)) 38 | 39 | html_doc = response.text.encode('gbk','ignore').decode('gbk') 40 | print(html_doc) 41 | ### 42 | for match in 
re.finditer(r'<a class="question_link" href="(.*?)">(.*?)</a>', html_doc):  # group(1)=link, group(2)=title; the <a ...> pattern is reconstructed, the original attribute filter was lost
        link = match.group(1)
        title = match.group(2)
        print(link, title)
    ###
-------------------------------------------------------------------------------- /23-C315Check.py: --------------------------------------------------------------------------------
'''
Check whether a purchased product is genuine, given its logistics anti-counterfeiting code.

Anti-counterfeiting traceability and logistics management service system of the
China Association for Quality Inspection: http://www.c315.cn/
'''

from pyquery import PyQuery as pq
import re, random
import urllib.request


# Run the verification query
def get_check(id):
    url = 'http://www.c315.cn/test2.asp?imageField.x=10&imageField.y=8&textfield2=&textfield=' + id
    html = urllib.request.urlopen(url).read().decode('gbk')
    m = re.search(r'<td[^>]*>(.*?)</td>', html)  # result cell; the original <td> attribute filter was not preserved
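
# A hedged sketch (added for illustration, not from the original file): the query
# result could also be parsed with the already-imported PyQuery instead of a raw
# regular expression. The table markup used below is an assumption, not the real
# c315.cn page structure.
def parse_check_result(html):
    doc = pq(html)
    # Collect the text of every table cell and drop the empty ones.
    cells = [pq(td).text().strip() for td in doc('td')]
    return [c for c in cells if c]

# Example with a stand-in snippet:
# parse_check_result('<table><tr><td>Result</td><td>The code is valid</td></tr></table>')
# -> ['Result', 'The code is valid']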
-------------------------------------------------------------------------------- /22-PyQuery.py: --------------------------------------------------------------------------------
'''
A summary of how to use the PyQuery library in Python.
'''
from pyquery import PyQuery as pq


print('---- 3. Getting elements by HTML tag ----')
d = pq("<div><p>test 1</p><p>test 2</p></div>")
print(d('p'))         # <p>test 1</p><p>test 2</p>
print(d('p').html())  # test 1


print('---- 4. eq(index): get the element at the given index ----')
d = pq("<div><p>test 1</p><p>test 2</p></div>")
print(d('p').eq(0))   # <p>test 1</p>
print(d('p').eq(1))   # <p>test 2</p>


print('---- 5. filter(selector): pick matched elements by class name or id ----')
d = pq("<div><p>test 1</p><p class='two'>test 2</p></div>")
print(d('p').filter('.two'))  # <p class="two">test 2</p>


print('---- 6. find(): search nested elements ----')
d = pq("<div><p>test 1</p><p>test 2</p></div>")
print(d('div').find('p'))        # <p>test 1</p><p>test 2</p>
print(d('div').find('p').eq(0))  # <p>test 1</p>


print('---- 7. Getting elements directly by class name or id ----')
d = pq("<div><p id='one'>test 1</p><p class='two'>test 2</p></div>")
print(d('#one').html())  # test 1
print(d('.two').html())  # test 2


print('---- 12. children(selector=None): get the child elements ----')
d = pq("<span><p id='one'>hello</p><p id='two'>world</p></span>")
print(d.children())        # <p id="one">hello</p><p id="two">world</p>
print(d.children('#two'))  # <p id="two">world</p>


print('---- 13. parents(selector=None): get the parent elements ----')
d = pq("<span><p id='one'>hello</p><p id='two'>world</p></span>")
print(d('p').parents())           # <span><p id="one">hello</p><p id="two">world</p></span>
print(d('#one').parents('span'))  # <span><p id="one">hello</p><p id="two">world</p></span>
print(d('#one').parents('p'))     # []


print('---- 14. clone(): return a copy of a node ----')
d = pq("<span><p id='one'>hello</p><p id='two'>world</p></span>")
print(d('#one'))          # <p id="one">hello</p>
print(d('#one').clone())  # <p id="one">hello</p>


print('---- 15. empty(): remove the content of a node ----')
d = pq("<span><p id='one'>hello</p><p id='two'>world</p></span>")
print(d)  # <span><p id="one">hello</p><p id="two">world</p></span>
d('#one').empty()
print(d)  # <span><p id="one"></p><p id="two">world</p></span>


print('---- 16. nextAll(selector=None): return all following sibling elements ----')
d = pq("<p id='one'>hello</p><p id='two'>world</p>")
print(d('p#one').nextAll())  # <p id="two">world</p>


print('---- 17. not_(selector): return elements that do not match the selector ----')
d = pq("<p id='one'>hello</p><p id='two'>world</p>")
print(d('p').not_('#two'))  # <p id="one">hello</p>
115 | 116 | 117 | 118 | ''' 119 | 爬取豆瓣电影页面中主演 120 | ''' 121 | if __name__ == '__main__': 122 | print('----爬取豆瓣电影页面中主演----') 123 | # 读取Batman Begins页面 124 | doc = pq(url='http://movie.douban.com/subject/3077412/') 125 | # 遍历starring节点 126 | starring = doc("a[rel='v:starring']") 127 | # 转化为Map 128 | stars = starring.map(lambda i,e:pq(e).text()) 129 | print('<<%s>>的主演:' % (doc("span[property='v:itemreviewed']").text())) 130 | for i in stars: 131 | print(i) 132 | ''' 133 | 执行结果: 134 | 135 | ----爬取豆瓣电影页面中主演---- 136 | <<寻龙诀>>的主演: 137 | 陈坤 138 | 黄渤 139 | 舒淇 140 | 杨颖 141 | 夏雨 142 | 刘晓庆 143 | 颜卓灵 144 | 曹操 145 | 张东 146 | 黄西 147 | 僧格仁钦 148 | ''' -------------------------------------------------------------------------------- /11-CSDNBlogList.py: -------------------------------------------------------------------------------- 1 | ''' 2 | 根据用户名,获取该用户的CSDN的博客列表 3 | ''' 4 | 5 | import urllib.request 6 | import re 7 | 8 | CSDN_URL = 'http://blog.csdn.net' 9 | 10 | # 获取主页网址 11 | def get_blog_url(bloger): 12 | return 'http://blog.csdn.net/'+bloger+'/article/list' 13 | 14 | # 根据网址获取HTML 15 | def get_blog_html(url): 16 | headers = { 17 | 'Connection': 'Keep-Alive', 18 | 'Accept': 'text/html, application/xhtml+xml, */*', 19 | 'Accept-Language': 'en-US,en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3', 20 | 'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko' 21 | } 22 | req = urllib.request.Request(url, headers = headers) 23 | html = urllib.request.urlopen(req).read().decode() 24 | return html 25 | 26 | # 获取页码 27 | def get_page_num(html): 28 | page = re.search(r'共(\d+)页', html).group(1) 29 | return page 30 | 31 | # 获取博客发表时间 32 | def get_blog_time(html): 33 | blogtime = re.search(r'(.*?)', html).group(1) 34 | return blogtime 35 | 36 | # 获取博客阅读次数 37 | def get_blog_reader(html): 38 | reader = re.search(r'(.*?)', html).group(1) 39 | return reader 40 | 41 | # 获取页面列表 42 | def get_blog_list(html): 43 | content = re.search(r'\s*([^\s]*)\s*', html) 44 | for match in re.finditer(r'\s*([^\s]*)\s*', html): 45 | link = match.group(1) 46 | title = match.group(2) 47 | blogurl = CSDN_URL + link 48 | bloghtml = get_blog_html(blogurl) 49 | blogtime = get_blog_time(bloghtml) 50 | blogreader = get_blog_reader(bloghtml) 51 | print('%s %s %+10s %-50s' % (link,blogtime,blogreader,title)) 52 | 53 | if __name__ == '__main__': 54 | bloger = 'oYunTaoLianWu' 55 | blogurl = get_blog_url(bloger) 56 | html = get_blog_html(blogurl) 57 | page = get_page_num(html) 58 | for x in range(int(page)): 59 | pageurl = blogurl + '/' + str(x+1) 60 | print('第%s页的博客目录如下(%s):' % (x+1,pageurl)) 61 | html = get_blog_html(pageurl) 62 | get_blog_list(html) 63 | 64 | 65 | ''' 66 | 运行结果: 67 | 68 | 第1页的博客目录如下(http://blog.csdn.net/oYunTaoLianWu/article/list/1): 69 | /jueblog/article/details/33700479 2014-06-23 09:07 916人阅读 notepad++列块编辑操作 70 | /jueblog/article/details/26486821 2014-05-21 17:17 891人阅读 【Chrome】Chrome插件开发(一)插件的简单实现 71 | /jueblog/article/details/17465225 2013-12-21 13:40 3137人阅读 【Java】实现按中文首字母排序 72 | /jueblog/article/details/16972635 2013-11-26 22:04 7031人阅读 【实用技术】WIN7系统下U盘安装了ubuntu13.04双系统 73 | /jueblog/article/details/16103925 2013-11-13 22:04 1171人阅读 【Android】Android蓝牙开发深入解析 74 | /jueblog/article/details/15013635 2013-11-09 23:27 2038人阅读 【Android】App自动更新之通知栏下载 75 | /jueblog/article/details/14600521 2013-11-08 23:27 2491人阅读 【Android】网络图片加载优化(一)利用弱引用缓存异步加载 76 | /jueblog/article/details/14497181 2013-11-07 22:43 8588人阅读 【Android】第三方QQ账号登录的实现 77 | /jueblog/article/details/13434551 2013-10-29 00:26 6374人阅读 【Java】内部类与外部类的互访使用小结 78 
| /jueblog/article/details/13164349 2013-10-27 02:08 8249人阅读 【Android】PULL解析XML文件 79 | /jueblog/article/details/12985045 2013-10-24 01:06 16847人阅读 【Android】Web开发之使用WebView控件展示Web页面 80 | 第2页的博客目录如下(http://blog.csdn.net/oYunTaoLianWu/article/list/2): 81 | /jueblog/article/details/12984417 2013-10-24 00:50 1396人阅读 【Android】Wifi管理与应用 82 | /jueblog/article/details/12983821 2013-10-24 00:38 1670人阅读 【Android】Web开发之通知栏下载更新APP 83 | /jueblog/article/details/12958737 2013-10-23 00:54 2754人阅读 【Android】Web开发之显示网络图片的两种方法 84 | /jueblog/article/details/12958159 2013-10-23 00:40 2658人阅读 【Android】Web开发之通过Apache接口处理Http请求 85 | /jueblog/article/details/12847239 2013-10-18 01:09 3142人阅读 【Android】MediaPlayer使用方法简单介绍 86 | /jueblog/article/details/12806909 2013-10-17 00:29 2334人阅读 【Android】Web开发之通过标准Java接口处理Http请求 87 | /jueblog/article/details/12764325 2013-10-16 01:23 3012人阅读 【Android】Activity与服务Service绑定 88 | /jueblog/article/details/12721651 2013-10-15 00:43 1681人阅读 【Android】利用服务Service创建标题栏通知 89 | /jueblog/article/details/12721555 2013-10-15 00:36 2253人阅读 【Android】利用广播Broadcast接收SMS短信 90 | /jueblog/article/details/12691855 2013-10-14 01:07 6796人阅读 【Android】利用广播BroadCast监听网络的变化 91 | /jueblog/article/details/12668215 2013-10-13 02:34 2909人阅读 【Android】Activity遮罩效果的实现 92 | /jueblog/article/details/12667463 2013-10-13 02:28 12967人阅读 【Android】BroadCast广播机制应用与实例 93 | /jueblog/article/details/12655269 2013-10-12 17:41 1182人阅读 【Android】Handler应用(四):AsyncTask的用法与实例 94 | /jueblog/article/details/12627403 2013-10-12 00:57 3269人阅读 【Android】Handler应用(三):从服务器端分页加载更新ListView 95 | ''' -------------------------------------------------------------------------------- /03-Chrome.py: -------------------------------------------------------------------------------- 1 | ''' 2 | http://www.yiibai.com/python/python3-webbug-series3.html 3 | ''' 4 | ''' 5 | 上一次我自学爬虫的时候, 写了一个简陋的勉强能运行的爬虫alpha. 6 | alpha版有很多问题: 7 | 1、比如一个网站上不了, 爬虫却一直在等待连接返回response, 不知道超时跳过; 8 | 2、或者有的网站专门拦截爬虫程序, 我们的爬虫也不会伪装自己成为浏览器正规部队; 9 | 3、并且抓取的内容没有保存到本地, 没有什么作用 10 | ''' 11 | 12 | import re 13 | import urllib.request 14 | import http.cookiejar 15 | import urllib 16 | from collections import deque 17 | import datetime 18 | 19 | 20 | ''' 21 | 添加超时跳过功能 22 | 23 | 首先, 我简单地将 24 | urlop = urllib.request.urlopen(url) 25 | 改为 26 | urlop = urllib.request.urlopen(url, timeout = 2) 27 | 28 | 运行后发现, 当发生超时, 程序因为exception中断 29 | 30 | 于是我把这一句也放在try .. except 结构里, 问题解决. 31 | ''' 32 | 33 | ''' 34 | 支持自动跳转 35 | 36 | 在爬 http://baidu.com 的时候, 爬回来一个没有什么内容的东西, 这个东西告诉我们应该跳转到 http://www.baidu.com . 37 | 但是我们的爬虫并不支持自动跳转, 现在我们来加上这个功能, 让爬虫在爬 baidu.com 的时候能够抓取 www.baidu.com 的内容. 38 | 39 | 首先我们要知道爬 http://baidu.com 的时候他返回的页面是怎么样的, 这个我们既可以用 Fiddler 看, 也可以写一个小爬虫来抓取. 40 | b'\n 41 | \n 42 | \n' 43 | 利用 html 的 meta 来刷新与重定向的代码, 其中的0是等待0秒后跳转, 也就是立即跳转. 44 | ''' 45 | 46 | ''' 47 | 伪装浏览器正规军 48 | 49 | 现在详细研究一下如何让网站们把我们的Python爬虫当成正规的浏览器来访. 50 | 因为如果不这么伪装自己, 有的网站就爬不回来了. 51 | 如果看过理论方面的知识, 就知道我们是要在 GET 的时候将 User-Agent 添加到header里. 52 | 53 | 在 GET 的时候添加 header 有很多方法, 下面介绍两种方法. 
54 | ''' 55 | 56 | ''' 57 | 第一种方法比较简便直接, 但是不好扩展功能, 代码如下: 58 | 59 | ''' 60 | if __name__ != '__main__': 61 | url = 'http://www.baidu.com/' 62 | req = urllib.request.Request(url, headers = { 63 | 'Connection': 'Keep-Alive', 64 | 'Accept': 'text/html, application/xhtml+xml, */*', 65 | 'Accept-Language': 'en-US,en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3', 66 | 'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko' 67 | }) 68 | oper = urllib.request.urlopen(req) 69 | data = oper.read() 70 | print(data) 71 | 72 | ''' 73 | 第二种方法使用了 build_opener 这个方法, 用来自定义 opener, 这种方法的好处是可以方便的拓展功能. 74 | 例如下面的代码就拓展了自动处理 Cookies 的功能. 75 | ''' 76 | if __name__ != '__main__': 77 | # head: dict of header 78 | def makeMyOpener(head = { 79 | 'Connection': 'Keep-Alive', 80 | 'Accept': 'text/html, application/xhtml+xml, */*', 81 | 'Accept-Language': 'en-US,en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3', 82 | 'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko' 83 | }): 84 | cj = http.cookiejar.CookieJar() 85 | opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cj)) 86 | header = [] 87 | for key, value in head.items(): 88 | elem = (key, value) 89 | header.append(elem) 90 | opener.addheaders = header 91 | return opener 92 | 93 | oper = makeMyOpener() 94 | uop = oper.open('http://www.baidu.com/', timeout = 1000) 95 | data = uop.read() 96 | print(data) 97 | 98 | 99 | ''' 100 | 101 | ''' 102 | if __name__!='__main__': 103 | 104 | data = urllib.request.urlopen('http://baidu.com').read() 105 | print(data) 106 | 107 | 108 | ''' 109 | 保存抓回来的报文 110 | 111 | Python 的文件操作还是相当方便的. 112 | 我们可以讲抓回来的数据 data 以二进制形式保存, 也可以经过 decode() 处理成为字符串后以文本形式保存. 113 | 改动一下打开文件的方式就能用不同的姿势保存文件了. 114 | 下面是参考代码: 115 | ''' 116 | def get_log(data,fileName='study'): 117 | save_path = 'result\\01-Chrome\\'+fileName+'.txt' 118 | f_obj = open(save_path, 'a+') # wb 表示打开方式 a 表示追加 119 | f_obj.write(data) 120 | f_obj.close() 121 | 122 | 123 | 124 | def get_url_deque(url): 125 | fileName = re.search(r'://(.*)', url).group(1).replace('/','-') 126 | queue = deque() 127 | visited = set() 128 | get_log('开始抓取:%s\n' % url,fileName) 129 | starttime = datetime.datetime.now() 130 | queue.append(url) 131 | cnt = 0 132 | 133 | while queue: 134 | url = queue.popleft() # 队首元素出队 135 | visited |= {url} # 标记为已访问 136 | 137 | get_log('已经抓取:'+str(cnt)+'正在抓取<---'+url + '\n',fileName) 138 | 139 | req = urllib.request.Request(url) 140 | req.add_header('User-Agent', 'Mozilla/6.0 (iPhone; CPU iPhone OS 8_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/8.0 Mobile/10A5376e Safari/8536.25') 141 | 142 | cnt += 1 143 | try: 144 | urlop = urllib.request.urlopen(req, timeout = 2000) 145 | except: 146 | continue 147 | 148 | # 用getheader()函数来获取抓取到的文件类型, 是html再继续分析其中的链接 149 | if 'html' not in urlop.getheader('Content-Type'): 150 | continue 151 | 152 | # 避免程序异常中止, 用try..catch处理异常 153 | try: 154 | data = urlop.read().decode('utf-8') 155 | except: 156 | continue 157 | 158 | # 正则表达式提取页面中所有队列, 并判断是否已经访问过, 然后加入待爬队列 159 | linkre = re.compile('href=\"(.+?)\"') 160 | for x in linkre.findall(data): 161 | if 'http' in x and x not in visited: 162 | queue.append(x) 163 | get_log('加入队列 ---> ' + x + '\n',fileName) 164 | endtime = datetime.datetime.now() 165 | get_log('抓取完毕!共耗时:%s.seconds \n' % (endtime - starttime),fileName) 166 | 167 | if __name__=='__main__': 168 | url = 'https://github.com/Jueee/04-LiaoXueFeng' # 入口页面, 可以换成别的 169 | get_url_deque(url) 170 | 171 | 
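
# Added illustration (not part of the original 03-Chrome.py): a compact fetch helper
# that combines the techniques walked through above -- a cookie-aware opener, a
# browser-like User-Agent, a timeout, and decoding by the charset the server declares.
# The fallback encoding and the error handling are assumptions, not the author's choices.
import http.cookiejar
import urllib.request

def fetch(url, timeout=10):
    cookie_jar = http.cookiejar.CookieJar()
    opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cookie_jar))
    opener.addheaders = [('User-Agent',
                          'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko')]
    try:
        with opener.open(url, timeout=timeout) as resp:
            charset = resp.headers.get_content_charset() or 'utf-8'
            return resp.read().decode(charset, errors='ignore')
    except Exception as e:
        print('fetch failed: %s (%s)' % (url, e))
        return None

# fetch('http://www.baidu.com/')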
-------------------------------------------------------------------------------- /19-BeautifulSoup.py: --------------------------------------------------------------------------------
'''
Beautiful Soup (the go-to matching/parsing tool for crawlers in Python 3).

Further reading: the Beautiful Soup documentation (Chinese edition)
http://www.crummy.com/software/BeautifulSoup/bs4/doc/index.zh.html
'''
'''
Beautiful Soup is an HTML/XML parser written in Python. It handles malformed markup well
and builds a parse tree, and it provides simple, commonly used operations for navigating,
searching and modifying that tree, which saves a lot of programming time.
'''
from bs4 import BeautifulSoup

html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""
# Get a BeautifulSoup object
soup = BeautifulSoup(html)
# Print the document with the standard indented structure
print(soup.prettify())

'''
# Simple ways to navigate the structured data:
'''
print(soup.title)
# <title>The Dormouse's story</title>
print(soup.title.name)
# title
print(soup.title.string)
# The Dormouse's story
print(soup.title.parent.name)
# head
print(soup.p)
# <p class="title"><b>The Dormouse's story</b></p>
print(soup.p['class'])
# ['title']
print(soup.a)
# <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
print(soup.find_all('a'))
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
print(soup.find(id='link3'))
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>

# Find the links of all <a> tags in the document:
for link in soup.find_all('a'):
    print(link.get('href'))
'''
http://example.com/elsie
http://example.com/lacie
http://example.com/tillie
'''

# Get all of the text content of the document:
print(soup.get_text())


'''
Kinds of objects

Beautiful Soup converts a complex HTML document into a tree in which every node is a Python object.
All objects fall into four kinds: Tag, NavigableString, BeautifulSoup, Comment.
'''
'''
Tag
A Tag object corresponds to an XML or HTML tag in the original document:
'''
soup = BeautifulSoup('<b class="boldest">Extremely bold</b>')
tag = soup.b
print(type(tag))
# <class 'bs4.element.Tag'>

'''
Name
Every tag has a name, available as .name:
'''
print(tag.name)
# b
# If you change a tag's name, the change is reflected in any HTML markup generated from now on:
tag.name = "blockquote"
print(tag)
# <blockquote class="boldest">Extremely bold</blockquote>
'''
Attributes
A tag may have any number of attributes. The tag <b class="boldest"> has a "class" attribute
whose value is "boldest". Tag attributes are handled just like a dictionary:
'''
print(tag['class'])
# ['boldest']
# Attributes can also be read directly with dotted access, e.g. .attrs:
print(tag.attrs)
# {'class': ['boldest']}

tag['class'] = 'verybold'
tag['id'] = 1
print(tag)
# <blockquote class="verybold" id="1">Extremely bold</blockquote>
del tag['id']
print(tag)
# <blockquote class="verybold">Extremely bold</blockquote>
'''
Multi-valued attributes
'''
css_soup = BeautifulSoup('<p class="body strikeout"></p>')
print(css_soup.p['class'])
# ['body', 'strikeout']
# If an attribute looks like it has more than one value, but it is not defined as a
# multi-valued attribute in any version of HTML, Beautiful Soup returns it as a string
id_soup = BeautifulSoup('<p id="my id"></p>')
print(id_soup.p['id'])
# my id
# When a tag is turned back into a string, the values of a multi-valued attribute are joined
rel_soup = BeautifulSoup('<p>Back to the <a rel="index">homepage</a></p>')
print(rel_soup.a['rel'])
# ['index']
rel_soup.a['rel'] = ['index', 'contents']
print(rel_soup.p)
# <p>Back to the <a rel="index contents">homepage</a></p>

'''
Navigable strings
Strings are usually contained inside tags. Beautiful Soup wraps the strings in a tag
with the NavigableString class:
'''
print(tag.string)
# Extremely bold
type(tag.string)
# <class 'bs4.element.NavigableString'>
# A string inside a tag cannot be edited in place, but it can be replaced with
# another string, using replace_with():
tag.string.replace_with("No longer bold")
print(tag)
# <blockquote class="verybold">No longer bold</blockquote>


print('---------Navigating the document tree---------')

html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""
soup = BeautifulSoup(html)
'''
Navigating the document tree
'''
print(soup.head)
print(soup.title)
# Dotted attribute access only returns the first tag with that name:
print(soup.body.b)
# To get all of the <a> tags:
print(soup.find_all('a'))



# A tag's .contents attribute lists the tag's children:
head_tag = soup.head
print(head_tag.contents)
print(head_tag.contents[0])
print(head_tag.contents[0].contents)
print(head_tag.contents[0].contents[0])
# The .children generator iterates over a tag's children:
for child in head_tag.contents[0]:
    print(child)


# .descendants
print('--------.descendants--------')
# .contents and .children only cover a tag's direct children.
# .descendants iterates recursively over all of a tag's descendants
for child in head_tag.descendants:
    print(child)

# .string
# If a tag has only one child of type NavigableString, that child is available as .string:
print(head_tag.string)
# If a tag contains more than one child, .string cannot decide which child's content to
# return, so .string is None:
print(soup.html.string)




# .strings and stripped_strings
# If a tag contains more than one string, they can be iterated with .strings:
for string in soup.strings:
    print(repr(string))

# The output can contain a lot of extra whitespace and blank lines; .stripped_strings removes them:
# (Note: strings consisting entirely of whitespace are skipped, and leading/trailing whitespace is stripped.)
for string in soup.stripped_strings:
    print(repr(string))

# Use the .parent attribute to get an element's parent node.


--------------------------------------------------------------------------------
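
# Added illustration (not part of the original 19-BeautifulSoup.py, which stops at the
# sentence introducing .parent): a minimal example of .parent and .parents on the same
# "three sisters" soup built above.
link = soup.a
print(link.parent.name)
# p
print(soup.title.parent.name)
# head
# .parents iterates over all ancestors, up to the BeautifulSoup document itself:
for parent in link.parents:
    print(parent.name)
# p
# body
# html
# [document]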