├── README.md └── python3网络爬虫实战练习 ├── 01-flask测试.py ├── 02-tornado测试.py ├── 03-BeautifulSoup.py ├── 04-pyquery.py ├── 05-数据存储.py ├── 06-json数据格式.py ├── 07-csv文件.py ├── 08-关系型数据库存储.py ├── 09-redis连接.py ├── 10-利用python模拟Ajax请求爬取微博详情.py ├── 11-分析Ajax爬取今日头条街拍美图.py ├── 12-selenium的使用.py ├── 13-selenium使用2.py ├── 14-splash使用.py ├── 15-使用selenium爬取淘宝商品.py ├── 16-中国知网验证码识别 ├── __init__.py ├── captcha1.png ├── captcha2.png ├── code.jpg ├── 极验验证码.py └── 验证码识别.py ├── 16-爬取虎牙直播主播信息.py ├── 17-借助在线验证码识别平台完成点出验证码的识别 ├── Chaojiying.py └── 点触验证码的识别.py ├── 19-TaoBaoMM ├── TaoBaoMM.py └── data │ └── result.db ├── 20-代理的使用.py ├── 3.jpg ├── data.csv ├── data ├── project.db ├── result.db ├── scheduler.1d ├── scheduler.1h ├── scheduler.all └── task.db ├── data1.csv └── ghostdriver.log /README.md: -------------------------------------------------------------------------------- 1 | # Python3WebSpider-Test 2 | Python3网络爬虫实战练习 3 | -------------------------------------------------------------------------------- /python3网络爬虫实战练习/01-flask测试.py: -------------------------------------------------------------------------------- 1 | from flask import Flask 2 | 3 | app=Flask(__name__) 4 | 5 | @app.route("/") 6 | def hello(): 7 | return 'Hello World!' 8 | 9 | 10 | if __name__=='__main__': 11 | app.run() -------------------------------------------------------------------------------- /python3网络爬虫实战练习/02-tornado测试.py: -------------------------------------------------------------------------------- 1 | import tornado.ioloop 2 | import tornado.web 3 | 4 | class MainHandler(tornado.web.RequestHandler): 5 | def get(self): 6 | self.write('Hello World!') 7 | 8 | def make_app(): 9 | return tornado.web.Application([(r"/",MainHandler),]) 10 | 11 | if __name__=='__main__': 12 | app=make_app() 13 | app.listen(8888) 14 | tornado.ioloop.IOLoop.current().start() -------------------------------------------------------------------------------- /python3网络爬虫实战练习/03-BeautifulSoup.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup #利用BeautifulSoup将网页的代码按照标准的缩进打印出来 3 | import re 4 | 5 | # response=requests.get('http://www.baidu.com') 6 | # response.encoding='utf-8' 7 | # print(response.text) 8 | 9 | # 基本用法 10 | html=requests.get('https://www.baidu.com') 11 | soup=BeautifulSoup(html.content,'lxml') #注意这里的html.content,不能直接传入html 12 | print(soup.prettify()) 13 | print(soup.title.string) 14 | 15 | # 选择元素 16 | print(soup.title) 17 | print(soup.p) 18 | 19 | # 提取信息 20 | print(soup.title.name) # 获取名称 21 | print(soup.p.attrs) # 获取属性 22 | print(soup.p.attrs['name']) 23 | 24 | # 获取内容 25 | print(soup.p.string) 26 | 27 | # 嵌套选择 28 | print(soup.head.title) 29 | print(type(soup.head.title)) 30 | print(soup.head.title.string) 31 | 32 | # 关联选择 33 | # 子节点和子孙节点 34 | print(soup.p.contents) # 获取p节点的直接子节点(以列表形式返回),即子孙节点 35 | print(soup.p.children) # 返回结果是生成器类型,接下来用for循环 36 | for i,child in enumerate(soup.p.children): 37 | print(i,child) 38 | 39 | print(soup.p.descendants) # 获取子孙节点 40 | for i,child in enumerate(soup.p.descendants): 41 | print(i,child) 42 | 43 | # 父节点和祖先节点 44 | print(soup.a.parent) # 获取a节点的直接父节点 45 | print(soup.a.parents) # 获取a节点的祖先节点 46 | 47 | # 兄弟节点 48 | print(soup.a.next_sibling) 49 | print(soup.a.previous_sibling) 50 | 51 | # 提取信息 52 | print(soup.a.next_sibling.string) 53 | print(soup.a.previous_sibling.text) 54 | 55 | # 方法选择器 56 | # find_all(name,attrs,recursive,text,**kwargs) 57 | print(soup.find_all('ul')) 58 | for ul in soup.find_all(name='ul'): 59 | 
print(ul.find_all(name='li')) 60 | 61 | print(soup.find_all(attrs={'name':'list-1'})) # 返回所有匹配的元素组成的列表 62 | print(soup.find_all(id='list-1')) 63 | 64 | print(soup.find_all(text=re.compile('link'))) 65 | 66 | print(soup.find(name='ul')) # 返回第一个匹配的元素 67 | 68 | # CSS选择器 69 | # print(soup.select('ul li')) 70 | # print(soup.select(#list-2 .element)) 71 | # print(soup.select('.panel .panel-heading')) 72 | 73 | # 获取属性 74 | for ul in soup.select('ul'): 75 | print(ul['id']) 76 | print(ul.attrs['id']) 77 | 78 | # 获取文本 79 | for li in soup.select('li'): 80 | print(li.get_text()) 81 | print(li.string) 82 | print(li.string) -------------------------------------------------------------------------------- /python3网络爬虫实战练习/04-pyquery.py: -------------------------------------------------------------------------------- 1 | from pyquery import PyQuery as pq 2 | import requests 3 | 4 | # URL初始化 5 | doc=pq(url='https://www.cuiqingcai.com') #请求url,得到结果 6 | doc=pq(requests.get('https://www.cuiqingcai.com').text) #等价于上面那句 7 | print(doc('title')) 8 | 9 | # 文件初始化 10 | doc=pq(filename=demo.html) 11 | print(doc('li')) 12 | 13 | # 基本CSS选择器 14 | html=''' 15 | 16 | ''' 17 | doc=pq(html) 18 | print(doc('#container .list li')) 19 | print(type(doc('#container .list li'))) 20 | 21 | # 查找节点 22 | # 子节点 23 | html=''' 24 | 25 | ''' 26 | doc=pq(html) 27 | items=doc('.list') 28 | print(type(items)) 29 | print(items) 30 | lis=items.find('li') # 查找所有的子孙节点 31 | # lis=items.children() # 查找所有的子节点 32 | # lis=items.children('.active') # 查找字节点中class为active的节点 33 | print(type(lis)) 34 | print(lis) 35 | 36 | 37 | # 父节点 38 | doc=pq(html) 39 | items=doc('.list') 40 | container=items.parent() # 查找父节点 41 | # container=items.parents() # 查找祖先节点 42 | print(type(container)) 43 | print(container) 44 | 45 | # # 兄弟节点 46 | doc=pq(html) 47 | li=doc('.list .item-0.active') # 选取class为list的节点内部class为item-0和active的节点 48 | print(li.siblings()) # 查找兄弟节点 49 | 50 | 51 | # 遍历(多个节点),调用items方法 52 | doc=pq(html) 53 | lis=doc('li').items() 54 | print(type(lis)) 55 | for li in lis: 56 | print(li,type(li)) 57 | 58 | # 获取信息 59 | # 获取属性 60 | doc=pq(html) 61 | a=doc('.item-0.active') 62 | print(a,type(a)) 63 | print(a.attr('href')) # 与下面等价 64 | # print(a.attr.hrer()) 65 | 66 | # 获取多个属性,需要遍历才能实现 67 | doc=pq(html) 68 | a=doc('a') 69 | for i in a.items(): 70 | print(a.attr('href')) 71 | 72 | # 获取文本(调用text()方法实现) 73 | doc=pq(html) 74 | a=doc('.item-0.active a') 75 | print(a) 76 | print(a.text()) # 去掉节点的包括的所有HTML,只返回纯文字内容 77 | 78 | # 获取HTML内容 79 | doc=pq(html) 80 | li=doc('.item-0.active') 81 | print(li) 82 | print(li.html) 83 | 84 | # 如果选取的是多个节点,text()或html()返回的是第一个li节点的内部的HTML文本,而text()返回的是所有的li节点内部的纯文本 85 | 86 | # 节点操作,比如为某个节点移除或者增添一个class 87 | doc=pq(html) 88 | li=doc('.item-0.active') 89 | print(li) 90 | li.remove_class('active') 91 | print(li) 92 | li.add_class('active') 93 | 94 | # attr,text和html(改变属性,文本,以及html) 95 | doc=pq(html) 96 | li=doc('.item-0.active') 97 | li.attr('name','link') 98 | print(li) 99 | li.text('changed item') 100 | print(li) 101 | li.html('changed item') 102 | print(li) 103 | 104 | 105 | # remove() 106 | # wrap.find('p').remove() 107 | # print(wrip.text()) 108 | 109 | # 伪类选择器 110 | 111 | 112 | 113 | -------------------------------------------------------------------------------- /python3网络爬虫实战练习/05-数据存储.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from pyquery import PyQuery as pq 3 | 4 | url='https://www.zhihu.com/' 5 | headers={ 6 | 'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) 
AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.170 Safari/537.36' 7 | } 8 | html=requests.get(url=url,headers=headers).text 9 | doc=pq(html) 10 | # print(doc) 11 | items=doc('.explore-tab .feed-item').items() 12 | print(items) 13 | for item in items: 14 | question=item.find('h2').text() 15 | print('1') 16 | author=item.find('.author-link-line').text() 17 | answer=pq(item.find('.content').html()).text() 18 | with open('explore.txt','a') as f: 19 | f.write('\n'.join([question,author,answer])) 20 | f.write('\n'+'='*50+'\n') 21 | f.close() 22 | 23 | 24 | # response2 = requests.get('https://p4.ssl.cdn.btime.com/t01f7081c44b722510b.jpg') 25 | # 26 | # with open('3.jpg','wb') as f: 27 | # f.write(response2.content) 28 | # f.close() 29 | -------------------------------------------------------------------------------- /python3网络爬虫实战练习/06-json数据格式.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | #json.loads()可以将文本字符串转化为json对象,可以是列表,也可以使是符串 4 | #json.dumps()可以将json对象转化为文本字符串 5 | #json的数据需要用双引号来包围,不能使用单引号,否则loads()方法会解析失败 6 | 7 | str='''[{ 8 | "name":"job", #注意数据必须是以双引号包围,不然json.loads()则会出现解析错误 9 | "age":"20", 10 | "gender":"boy" 11 | },{ 12 | "name":"jack", 13 | "age":"22", 14 | "gender":"boy" 15 | }]''' 16 | print(type(str)) 17 | data=json.loads(str) 18 | print(type(data)) 19 | print(data) 20 | -------------------------------------------------------------------------------- /python3网络爬虫实战练习/07-csv文件.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import pandas as pd #利用pandas读取csv数据文件 3 | 4 | #csv,其文件以纯文本的形式存储表格数据,相当于一个结构化表的纯文本形式 5 | #写入 6 | with open('data.csv','w') as csvfile: #打开csv文件,获得文件句柄 7 | writer=csv.writer(csvfile) #初始化传入对象,传入该句柄 8 | writer=csv.writer(csvfile,delimiter=' ') #可以修改数据之间的分隔符,默认是逗号 9 | writer.writerow(['id','name','age']) #以writerow()传入每行的数据 10 | writer.writerow(['1','job','20']) 11 | writer.writerow(['2','jack','22']) 12 | 13 | 14 | #写入字典格式的数据 15 | with open('data1.csv','w',encoding='utf-8') as csvfile: 16 | fieldnames=['id','name','age'] #先定义三个字段,用filenames表示 17 | writer=csv.DictWriter(csvfile,fieldnames=fieldnames) #将字段传入给Dictwriter来初始化一个字典写入对象 18 | writer.writeheader() #写入头信息 19 | writer.writerow({'id':'100','name':'job','age':22}) #传入相应字段 20 | writer.writerow({'id':'101','name':'tom','age':32}) 21 | writer.writerow({'id':'102','name':'mary','age':25}) 22 | 23 | #读取 24 | with open('data1.csv','r',encoding='utf-8') as csvfile: 25 | reader=csv.reader(csvfile) 26 | for row in reader: 27 | print(row) 28 | 29 | 30 | 31 | df=pd.read_csv('data1.csv') 32 | print(df) 33 | -------------------------------------------------------------------------------- /python3网络爬虫实战练习/08-关系型数据库存储.py: -------------------------------------------------------------------------------- 1 | #连接数据库 2 | import pymysql 3 | 4 | # db=pymysql.connect(host='localhost',user='root',password='748491',port=3306) 5 | # cursor=db.cursor() 6 | # cursor.execute('select version()') 7 | # data=cursor.fetchone() 8 | # print('database version:',data) 9 | # cursor.execute('create database spiders default character set utf8') 10 | # db.close() 11 | 12 | #创建表 13 | id='101' 14 | user='job' 15 | age='22' 16 | db=pymysql.connect(host='localhost',user='root',password='748491',port=3306,db='spiders') 17 | #sql='create table if not exists students (id varchar(255) not null,name varchar(255) not null,age int not null ,primary key(id))' 18 | 
#注意传入数据的时候尽量使用这种格式化符的形式,有几个value写几个%s,只需要在execute()方法的第一个参数传入该sql语句,value值统一传一个元组就好了 19 | sql='insert into students(id,name,age) values(%s,%s,%s)' 20 | cursor=db.cursor() 21 | # cursor.execute(sql1) 22 | try: 23 | cursor.execute(sql,(id,user,age)) 24 | db.commit() #必须执行commit()方法才能将数据插入数据库中 25 | except: 26 | db.rollback() #异常处理,如果插入数据失败,就执行一次回滚,相当于什么都没有发生过 27 | db.close() 28 | -------------------------------------------------------------------------------- /python3网络爬虫实战练习/09-redis连接.py: -------------------------------------------------------------------------------- 1 | from redis import StrictRedis,ConnectionPool 2 | 3 | 4 | redis=StrictRedis(host='localhost',port=6379,db=0,password=None) #两种连接方式都可以 5 | # pool=ConnectionPool(host='localhost',port=6379,db=0,password=None) 6 | # redis=StrictRedis(connection_pool=pool) 7 | redis.set('name','job') 8 | redis.set('age',22) 9 | print(redis.get('name')) 10 | print(redis.exists('name')) 11 | redis.delete('age') 12 | print(type('name')) 13 | print(redis.keys('n*')) 14 | print(redis.randomkey()) 15 | redis.move('name',2) 16 | # redis.flushdb() #删除当前选择数据库中的所有的键 17 | # redis.flushall() #删除所有数据库中的所有的键 18 | print(redis.dbsize()) #获取当前数据库中的所有键的数目 19 | -------------------------------------------------------------------------------- /python3网络爬虫实战练习/10-利用python模拟Ajax请求爬取微博详情.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from urllib.parse import urlencode 3 | from pyquery import PyQuery as pq 4 | from pymongo import MongoClient 5 | 6 | client=MongoClient() #连接moongodb数据库,定义一些需要用到的变量 7 | db=client['weibo'] 8 | collection=db['weibo'] 9 | 10 | base_url='' 11 | headers={ 12 | 'Host: m.weibo.cn', 13 | 'Referer: https://m.weibo.cn/', 14 | 'User-Agent: Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.170 Safari/537.36', 15 | 'X-Requested-With: XMLHttpRequest' 16 | } 17 | 18 | def get_page(page): #定义一个获取ajax请求内容的函数,结果返回json数据格式 19 | params={ 20 | 'type':'uid', 21 | 'value':'', 22 | 'contained':'', 23 | 'page':page 24 | } 25 | url=base_url+urlencode(params) 26 | try: 27 | response=requests.get(url,headers=headers) 28 | if response.status_code==200: 29 | return response.json() 30 | except requests.ConnectionError as e: 31 | print('Error',e.args) 32 | 33 | def parse_page(json): #解析返回的json数据,并返回一个字典,包括微博的id,正文,赞数,评论数以及转发数 34 | if json: 35 | items=json.get('data').get('cards') 36 | for item in items: 37 | item=item.get('mblog') 38 | weibo={} 39 | weibo['id']=item.get() 40 | weibo['text']=pq(item.get('text')).text() #借助pquery将正文中的html去掉 41 | weibo['attitudes']=item.get('attitudes_count') 42 | weibo['comments']=item.get('comments_count') 43 | weibo['reposts']=item.get('reposts_count') 44 | yield weibo 45 | 46 | def save_to_mongo(result): #定义一个将数据存到mongodb数据库的方法 47 | if collection.insert(result): 48 | print('Save to Mongo!') 49 | 50 | if __name__=='__main__': 51 | for page in range(1,11): #遍历一下page,一共10页,将提取到的结果打印出来 52 | json=get_page(page) 53 | results=parse_page(json) 54 | for result in results: 55 | print(result) 56 | save_to_mongo(result) -------------------------------------------------------------------------------- /python3网络爬虫实战练习/11-分析Ajax爬取今日头条街拍美图.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from urllib.parse import urlencode 3 | import os 4 | from hashlib import md5 5 | from multiprocessing.pool import Pool 6 | from redis import StrictRedis 7 | 8 | 9 | 
redis=StrictRedis(host='localhost',port=6379,db=0,password=None) 10 | 11 | def get_one_page(offset): #实现方法来加载单个Ajax请求的结果,返回json的字符串格式 12 | params={ 13 | 'offset':offset, 14 | 'format':'json', 15 | 'keyword':'街拍', 16 | 'autoload':'true', 17 | 'count':'20', 18 | 'cur_tab':'1', 19 | } 20 | url='https://www.toutiao.com/search_content/?'+urlencode(params) 21 | try: 22 | response=requests.get(url) 23 | if response.status_code==200: 24 | return response.json() 25 | except requests.ConnectionError as e: 26 | return None 27 | 28 | def get_images(json): #实现一个解析方法,提取每条数据的image_list字段中的每一张图片的链接,将图片链接和图片所属的标题一并返回,可以构造一个生成器 29 | if json.get('data'): 30 | for item in json.get('data'): 31 | title=item.get('title') 32 | images=item.get('image_list') 33 | if images: 34 | for image in images: 35 | yield{ 36 | 'image':'http:'+image.get('url'), 37 | 'title':title, 38 | } 39 | else: 40 | return None 41 | #定义一个保存图片的方法,首先根据item的title来创建文件夹,然后请求这个图片链接,获取图片的二进制数据,以二进制的形式写 42 | #入文件。图片的名称可以使用其内容的MD5值,这样可以去除重复。 43 | def save_image(item): 44 | if not os.path.exists(item.get('title')): 45 | os.mkdir(item.get('title')) 46 | try: 47 | response=requests.get(item.get('image')) 48 | if response.status_code==200: 49 | file_path='{0}/{1}{2}'.format(item.get('title'),md5(response.content).hexdigest(),'.jpg') 50 | if not os.path.exists(file_path): 51 | with open(file_path,'wb') as f: 52 | f.write(response.content) 53 | else: 54 | print('Already Downloaded',file_path) 55 | # save_to_redis(f) 56 | except requests.ConnectionError: 57 | print('Failed to Save Image!') 58 | except requests.exceptions.MissingSchema as rem: 59 | print(rem) 60 | 61 | # def save_to_redis(f): #定义一个将数据存到mongodb数据库的方法 62 | # if redis.insert(f): 63 | # print('Save to Redid!') 64 | 65 | 66 | def main(offset): 67 | json=get_one_page(offset) 68 | for item in get_images(json): 69 | print(item) 70 | save_image(item) 71 | 72 | 73 | GROUP_START=1 74 | GROUP_END=20 75 | if __name__=='__main__': 76 | pool=Pool() 77 | groups=([x*20 for x in range(GROUP_START,GROUP_END+1)]) 78 | pool.map(main,groups) #利用多线程的线程池,调用其map方法实现多线程下载 79 | pool.close() 80 | pool.join() 81 | 82 | -------------------------------------------------------------------------------- /python3网络爬虫实战练习/12-selenium的使用.py: -------------------------------------------------------------------------------- 1 | from selenium import webdriver 2 | from selenium.webdriver.common.by import By 3 | from selenium.webdriver.common.keys import Keys 4 | from selenium.webdriver.support import expected_conditions as EC 5 | from selenium.webdriver.support.wait import WebDriverWait 6 | 7 | 8 | #通过selenium来驱动浏览器加载网页的话,可以直接拿到JavaScript渲染的结果了,不用担心使用的是什么加密结果 9 | browser=webdriver.Chrome() 10 | try: 11 | browser.get('https://www.baidu.com') 12 | # input_=browser.find_elements_by_id('kw') 13 | # input_.send_keys('Python') 14 | # input_.send_keys(Keys.ENTER) 15 | # wait=WebDriverWait(browser,10) 16 | # wait.until(EC.presence_of_element_located((By.ID,'content-left'))) 17 | print(browser.current_url) 18 | print(browser.get_cookies()) 19 | print(browser.page_source) 20 | finally: 21 | browser.close() 22 | 23 | 24 | -------------------------------------------------------------------------------- /python3网络爬虫实战练习/13-selenium使用2.py: -------------------------------------------------------------------------------- 1 | from selenium import webdriver 2 | import time 3 | from selenium.webdriver import ActionChains #动作链 4 | 5 | #查找节点并访问页面 6 | # browser=webdriver.Chrome() 7 | # browser.get('http://www.taobao.com') 8 | # 
lis=browser.find_elements_by_css_selector('body > div.screen-outer.clearfix > div.main > div.main-inner.clearfix > div.tbh-service.J_Module > div > ul > li:nth-child(3) > a:nth-child(1)') 9 | # print(lis) 10 | # browser.close() 11 | 12 | #节点交互 13 | # browser=webdriver.Chrome() 14 | # browser.get('https://www.taobao.com') 15 | # input=browser.find_element_by_id('q') 16 | # input.send_keys('iphone') 17 | # time.sleep(3) 18 | # input.clear() 19 | # input.send_keys('iPad') 20 | # button=browser.find_element_by_class_name('btn-search') 21 | # button.click() 22 | # time.sleep(3) 23 | # browser.close() 24 | 25 | #动作链,实现一个节点的拖拽 26 | # browser=webdriver.Chrome() 27 | # url='' 28 | # browser.get(url) 29 | # browser.switch_to.frame('') 30 | # source=browser.find_element_by_css_selector('#draggeable') 31 | # target=browser.find_element_by_css_selector('#droppable') 32 | # actions=ActionChains(browser) 33 | # actions.drag_and_drop(source,target) 34 | # actions.perform() 35 | 36 | #执行JavaScript(比如下拉进度条) 37 | # from selenium import webdriver 38 | # 39 | # browser=webdriver.Chrome() 40 | # browser.get('http://www.zhihu.com/explore') 41 | # browser.execute_script('window.scrollTo(0,document.body.scrollHeight)') #执行下拉进度条的脚本 42 | # browser.execute_script('alert("To Bottom")') #下拉到底部之后弹出一段提示 43 | # browser.close() 44 | 45 | #获取节点信息 46 | 47 | #获取属性(前提是先选中这个节点) 48 | from selenium import webdriver 49 | from selenium.webdriver import ActionChains 50 | 51 | # browser=webdriver.Chrome() 52 | # browser.get('http://www.zhihu.com/explore') 53 | # logo=browser.find_element_by_id('zh-top-link-logo') #选中这个logo 54 | # print(logo) 55 | # print(logo.get_attribute('class')) 56 | # browser.close() 57 | 58 | #获取文本值 59 | # from selenium import webdriver 60 | # 61 | # browser=webdriver.PhantomJS() 62 | # browser.get('http://www.zhihu.com/explore') 63 | # input=browser.find_element_by_class_name('zu-top-add-question') #获取提问这个按钮,然后将此按钮的文本内容打印出来 64 | # print(input.text) 65 | # 66 | # #获取ID,标签名 67 | # print(input.id) 68 | # print(input.location) 69 | # print(input.tag_name) 70 | # print(input.size) 71 | 72 | #切换frame 73 | # import time 74 | # from selenium import webdriver 75 | # from selenium.common.exceptions import NoSuchElementException 76 | # 77 | # browser=webdriver.Chrome() 78 | # url='http://www.runoob.com/try/try.php?filenanme=jqueryui-api-droppable' 79 | # browser.get(url) 80 | # browser.switch_to.frame('iframeResult') #切换至子级Frame里面,然后尝试获取父级里面的logo,找不到的话,抛出异常 81 | # try: 82 | # logo=browser.find_element_by_class_name('logo') 83 | # except NoSuchElementException: 84 | # print('No Logo!') 85 | # browser.switch_to.parent_frame() #切换至父级Frame里面,再次获取节点 86 | # logo=browser.find_element_by_class_name('logo') 87 | # print(logo) 88 | # print(logo.text) 89 | # browser.close() 90 | 91 | #延时等待(显式等待,隐式等待) 92 | #隐式等待,超出等待时间后,将抛出超时异常 93 | # from selenium import webdriver 94 | # browser=webdriver.Chrome() 95 | # browser.implicitly_wait(10) 96 | # browser.get('http://www.zhihu.com/explore') 97 | # input=browser.find_element_by_class_name('zu-top-add-question') 98 | # print(input) 99 | # browser.close() 100 | 101 | #显式等待,指定要查找的节点,然后之指定一个最长等待时间,如果规定时间加载出来,就返回查找的节点,否则就抛出异常 102 | # from selenium import webdriver 103 | # from selenium.webdriver.common.by import By 104 | # from selenium.webdriver.support.ui import WebDriverWait 105 | # from selenium.webdriver.support import expected_conditions as EC 106 | # 107 | # browser=webdriver.Chrome() 108 | # browser.get('http://www.taobao.com') 109 | # wait=WebDriverWait(browser,10) 110 | # 
input=wait.until(EC.presence_of_element_located((By.ID,'q'))) #传入条件,表示等待节点出现,下面的表示节点按钮可点击 111 | # button=wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR,'.btn-search'))) 112 | # print(input,button) 113 | # browser.close() 114 | 115 | #前进和后退(back(),forward()) 116 | # import time 117 | # from selenium import webdriver 118 | # 119 | # browser=webdriver.Chrome() 120 | # browser.get('http://www.baidu.com/') 121 | # browser.get('http://www.taobao.com/') 122 | # # browser.get('http://www.python.org/') 123 | # browser.back() 124 | # time.sleep(1) 125 | # browser.forward() 126 | # browser.close() 127 | 128 | 129 | #cookies 130 | # from selenium import webdriver 131 | # 132 | # browser=webdriver.Chrome() 133 | # browser.get('http://www.zhihu.com/explore') 134 | # print(browser.get_cookies()) 135 | # browser.add_cookie({'name':'name','domain':'www.zhihu.com','value':'job'}) 136 | # print(browser.get_cookies()) 137 | # browser.delete_all_cookies() 138 | # print(browser.get_cookies()) 139 | # browser.close() 140 | 141 | #选项卡操作 142 | import time 143 | from selenium import webdriver 144 | 145 | browser=webdriver.Chrome() 146 | browser.get('https://www.baidu.com') 147 | browser.execute_script('window.open()') #新开启一个选项卡 148 | print(browser.window_handles) #获取当前所有开启的选项卡的代号列表 149 | browser.switch_to.window(browser.window_handles[1]) #切换选项卡 150 | browser.get('https://www.taobao.com') #在新开启的选项卡里面打开一个网页 151 | time.sleep(1) 152 | browser.switch_to.window(browser.window_handles[0]) #切回到原来的选项卡 153 | browser.get('https://www.python.org') #执行操作 154 | browser.close() -------------------------------------------------------------------------------- /python3网络爬虫实战练习/14-splash使用.py: -------------------------------------------------------------------------------- 1 | import requests 2 | 3 | 4 | #利用render.html获取经过JavaScript渲染之后的页面与源代码 5 | 'curl http://localhost:8050/render.html?url=https://www.baidu.com' 6 | 7 | 8 | 9 | #利用render.har获取页面加载的HAR数据,返回的是json数据 10 | 'curl http://localhost:8050/render.har?url=https://www.jd.com&wait=5' 11 | 12 | 13 | 14 | #获取京东首页渲染完成之后的页面截图,并将其保存在本地(调用了splash的API render.png) 15 | url='http://localhost:8050/render.png?url=https://www,jd.com&wait=5&width=1000&height=700' 16 | response=requests.get(url) 17 | with open('taobao.png','wb') as f: 18 | f.write(response.content) 19 | 20 | 21 | 22 | #此接口包含前面借口的所有功能,返回json数据格式render.json 23 | 'curl http://localhost:8050/render.json?url=https://httpbin.org' 24 | 25 | 26 | 27 | #此接口用于实现与Splash Lua脚本的对接(execute) 28 | #先实现一个Lua一个最简单的脚本 29 | # function main(splash) 30 | # return 'Hello' 31 | # end 32 | #然后将此脚本转化为URL编码的字符串,拼接到execute接口后面 33 | 'curl http://localhost:8050/execute?lua_source=function+main(.*?)end' 34 | #运行结果,输出Hello,利用python实现 35 | import requests 36 | from urllib.parse import quote 37 | lua=''' 38 | function main(splash) 39 | return 'Hello' 40 | end 41 | ''' 42 | url='http://localhost:8050/execucte?lua_source='+quote(lua) 43 | response=requests.get(url) 44 | print(response.text) -------------------------------------------------------------------------------- /python3网络爬虫实战练习/15-使用selenium爬取淘宝商品.py: -------------------------------------------------------------------------------- 1 | ''' 2 | 在之前,我们已经会分析Ajax请求来抓取相关数据,但是并不是所有页面都可以通过分析Ajax来完成抓取。比如,淘宝 3 | 它的页面的确也是通过Ajax获取的,但是这些Ajax比较复杂,可能会包含加密密钥,所以想通过自己构造Ajax参数,还 4 | 时比较困难的,最快速的方法还是selenium。本节中,我们就通过selenium来模拟浏览器操作,抓取淘宝商品信息,并 5 | 将其存储在Mongodb中。 6 | ''' 7 | from selenium import webdriver 8 | from selenium.common.exceptions import TimeoutException 9 | from selenium.webdriver.common.by import By 10 
| from selenium.webdriver.support import expected_conditions as EC 11 | from selenium.webdriver.support.wait import WebDriverWait 12 | from urllib.parse import quote 13 | from pyquery import PyQuery as pq 14 | from multiprocessing import Pool 15 | import pymongo 16 | 17 | chrome_options=webdriver.ChromeOptions() 18 | chrome_options.add_argument('--headless') 19 | browser=webdriver.Chrome(chrome_options=chrome_options) #设置chrome的无头模式(headless) 20 | # browser=webdriver.Chrome() 21 | wait=WebDriverWait(browser,10) 22 | KEYWORD='ipad' 23 | MONGO_URL='localhost' 24 | MONGO_DB='TAOBAO' 25 | MONGO_COLLECTION='products' 26 | client=pymongo.MongoClient(MONGO_URL) 27 | db=client[MONGO_DB] 28 | 29 | 30 | def index_page(page): #定义一个获取索引页信息的函数 31 | print('正在爬取第',page,'页') 32 | try: 33 | url='https://s.taobao.com/search?q='+quote(KEYWORD) 34 | browser.get(url) 35 | if page>1: 36 | input=wait.until(EC.presence_of_element_located((By.CSS_SELECTOR,'#mainsrp-pager > div > div > div > div.form > input'))) 37 | submit=wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR,'#mainsrp-pager > div > div > div > div.form > span.btn.J_Submit'))) 38 | input.clear() 39 | input.send_keys(page) 40 | submit.click() 41 | wait.until(EC.text_to_be_present_in_element((By.CSS_SELECTOR,'#mainsrp-pager > div > div > div > ul > li.item.active > span'),str(page))) 42 | wait.until(EC.presence_of_element_located((By.CSS_SELECTOR,'.m-itemlist .items .item'))) 43 | get_products() 44 | except TimeoutException: 45 | index_page(page) 46 | 47 | 48 | def get_products(): #解析获取到的索引页的信息,将商品的信息从中提取出来 49 | html=browser.page_source 50 | doc=pq(html) 51 | items=doc('#mainsrp-itemlist .items .item').items() 52 | for item in items: 53 | product= { 54 | 'image': item.find('.pic .img').attr('data-src'), 55 | 'price': item.find('.price').text(), 56 | 'deal': item.find('.deal-cnt').text(), 57 | 'title': item.find('.title').text(), 58 | 'shop': item.find('.shop').text(), 59 | 'location': item.find('.location').text() 60 | } 61 | print(product) 62 | # save_to_mongo(product) 63 | 64 | def save_to_mongo(result): #定义一个存储到MONGODB数据库的方法 65 | try: 66 | if db[MONGO_COLLECTION].insert(result): 67 | print('存储到Mongodb成功!') 68 | except Exception: 69 | print('存储到Mongodb失败!') 70 | 71 | MAX_PAGE=100 72 | def main(): #实现页码的遍历 73 | for i in range(1,MAX_PAGE+1): 74 | index_page(i) 75 | 76 | if __name__=='__main__': 77 | main() 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | -------------------------------------------------------------------------------- /python3网络爬虫实战练习/16-中国知网验证码识别/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coolcooljob/Python3WebSpider-Test/b1ff058286fcf10742d08e7c45a74e6f1369708a/python3网络爬虫实战练习/16-中国知网验证码识别/__init__.py -------------------------------------------------------------------------------- /python3网络爬虫实战练习/16-中国知网验证码识别/captcha1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coolcooljob/Python3WebSpider-Test/b1ff058286fcf10742d08e7c45a74e6f1369708a/python3网络爬虫实战练习/16-中国知网验证码识别/captcha1.png -------------------------------------------------------------------------------- /python3网络爬虫实战练习/16-中国知网验证码识别/captcha2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coolcooljob/Python3WebSpider-Test/b1ff058286fcf10742d08e7c45a74e6f1369708a/python3网络爬虫实战练习/16-中国知网验证码识别/captcha2.png 
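Note: captcha1.png and captcha2.png in this folder appear to be the screenshots written by 极验验证码.py below — get_geetest_image('captcha1.png') saves the full captcha image and get_geetest_image('captcha2.png') the one with the gap revealed. The following is a minimal offline sketch (not part of the original project) for re-running the gap-detection step against these two saved images; the helper names, the start column 60 and the per-channel threshold 60 simply mirror the values used in CrackGeetest.get_gap() and is_pixel_equal(), and the two PNGs are assumed to be in the current working directory.

from PIL import Image

def is_pixel_equal(img1, img2, x, y, threshold=60):
    # treat two pixels as equal when every RGB channel differs by less than the threshold
    p1, p2 = img1.load()[x, y], img2.load()[x, y]
    return all(abs(p1[k] - p2[k]) < threshold for k in range(3))

def get_gap(img1, img2, start=60):
    # scan columns left to right and return the first x where the two screenshots differ
    for i in range(start, img1.size[0]):
        for j in range(img1.size[1]):
            if not is_pixel_equal(img1, img2, i, j):
                return i
    return start

if __name__ == '__main__':
    image1 = Image.open('captcha1.png')   # full captcha screenshot
    image2 = Image.open('captcha2.png')   # screenshot with the gap revealed
    print('gap left edge x =', get_gap(image1, image2))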
-------------------------------------------------------------------------------- /python3网络爬虫实战练习/16-中国知网验证码识别/code.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coolcooljob/Python3WebSpider-Test/b1ff058286fcf10742d08e7c45a74e6f1369708a/python3网络爬虫实战练习/16-中国知网验证码识别/code.jpg -------------------------------------------------------------------------------- /python3网络爬虫实战练习/16-中国知网验证码识别/极验验证码.py: -------------------------------------------------------------------------------- 1 | ''' 2 | 极验验证码特点:首先点击按钮进行智能验证,如果验证不通过,则会弹出滑动验证的窗口,拖动滑块拼合图像进行验证,之后生成三个加密 3 | 参数,通过表单提交到后台,后台还会进行一次验证。 4 | 识别验证需要三步: 5 | 1.模拟点击验证按钮 6 | 2.识别滑动缺口的位置 7 | 3.模拟拖动滑块 8 | ''' 9 | import time 10 | from io import BytesIO 11 | from PIL import Image 12 | from selenium import webdriver 13 | from selenium.webdriver import ActionChains 14 | from selenium.webdriver.common.by import By 15 | from selenium.webdriver.support.ui import WebDriverWait 16 | from selenium.webdriver.support import expected_conditions as EC 17 | 18 | EMAIL = 'coolcooljob@163.com' 19 | PASSWORD = 'zb748491' 20 | BORDER = 6 21 | INIT_LEFT = 60 22 | 23 | 24 | class CrackGeetest(): 25 | def __init__(self): 26 | self.url = 'https://account.geetest.com/login' 27 | self.browser = webdriver.Chrome() 28 | self.wait = WebDriverWait(self.browser, 20) 29 | self.email = EMAIL 30 | self.password = PASSWORD 31 | 32 | def __del__(self): 33 | self.browser.close() 34 | 35 | def get_geetest_button(self): 36 | """ 37 | 获取初始验证按钮 38 | :return: 39 | """ 40 | button = self.wait.until(EC.element_to_be_clickable((By.CLASS_NAME, 'geetest_radar_tip'))) 41 | return button 42 | 43 | def get_position(self): 44 | """ 45 | 获取验证码位置 46 | :return: 验证码位置元组 47 | """ 48 | img = self.wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'geetest_canvas_img'))) 49 | time.sleep(2) 50 | location = img.location 51 | size = img.size 52 | top, bottom, left, right = location['y'], location['y'] + size['height'], location['x'], location['x'] + size[ 53 | 'width'] 54 | return (top, bottom, left, right) 55 | 56 | def get_screenshot(self): 57 | """ 58 | 获取网页截图 59 | :return: 截图对象 60 | """ 61 | screenshot = self.browser.get_screenshot_as_png() 62 | screenshot = Image.open(BytesIO(screenshot)) 63 | return screenshot 64 | 65 | def get_slider(self): 66 | """ 67 | 获取滑块 68 | :return: 滑块对象 69 | """ 70 | slider = self.wait.until(EC.element_to_be_clickable((By.CLASS_NAME, 'geetest_slider_button'))) 71 | return slider 72 | 73 | def get_geetest_image(self, name='captcha.png'): 74 | """ 75 | 获取验证码图片 76 | :return: 图片对象 77 | """ 78 | top, bottom, left, right = self.get_position() 79 | print('验证码位置', top, bottom, left, right) 80 | screenshot = self.get_screenshot() 81 | captcha = screenshot.crop((left, top, right, bottom)) 82 | captcha.save(name) 83 | return captcha 84 | 85 | def open(self): 86 | """ 87 | 打开网页输入用户名密码 88 | :return: None 89 | """ 90 | self.browser.get(self.url) 91 | email = self.wait.until(EC.presence_of_element_located((By.ID, 'email'))) 92 | password = self.wait.until(EC.presence_of_element_located((By.ID, 'password'))) 93 | email.send_keys(self.email) 94 | password.send_keys(self.password) 95 | 96 | def get_gap(self, image1, image2): 97 | """ 98 | 获取缺口偏移量 99 | :param image1: 不带缺口图片 100 | :param image2: 带缺口图片 101 | :return: 102 | """ 103 | left = 60 104 | for i in range(left, image1.size[0]): 105 | for j in range(image1.size[1]): 106 | if not self.is_pixel_equal(image1, image2, i, j): 107 | left = i 108 | return left 109 | return left 
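    # Note (added): get_gap() above scans the two screenshots column by column starting
    # from x = 60 (presumably to skip the slider area at the left edge) and returns the
    # first x at which some pixel differs noticeably between the full image and the
    # gapped image; that x is then taken as the left edge of the gap. The actual pixel
    # comparison is done by is_pixel_equal() below with a per-channel threshold.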
110 | 111 | def is_pixel_equal(self, image1, image2, x, y): 112 | """ 113 | 判断两个像素是否相同 114 | :param image1: 图片1 115 | :param image2: 图片2 116 | :param x: 位置x 117 | :param y: 位置y 118 | :return: 像素是否相同 119 | """ 120 | # 取两个图片的像素点 121 | pixel1 = image1.load()[x, y] 122 | pixel2 = image2.load()[x, y] 123 | threshold = 60 124 | if abs(pixel1[0] - pixel2[0]) < threshold and abs(pixel1[1] - pixel2[1]) < threshold and abs( 125 | pixel1[2] - pixel2[2]) < threshold: 126 | return True 127 | else: 128 | return False 129 | 130 | def get_track(self, distance): 131 | """ 132 | 根据偏移量获取移动轨迹 133 | :param distance: 偏移量 134 | :return: 移动轨迹 135 | """ 136 | # 移动轨迹 137 | track = [] 138 | # 当前位移 139 | current = 0 140 | # 减速阈值 141 | mid = distance * 4 / 5 142 | # 计算间隔 143 | t = 0.2 144 | # 初速度 145 | v = 0 146 | 147 | while current < distance: 148 | if current < mid: 149 | # 加速度为正2 150 | a = 2 151 | else: 152 | # 加速度为负3 153 | a = -3 154 | # 初速度v0 155 | v0 = v 156 | # 当前速度v = v0 + at 157 | v = v0 + a * t 158 | # 移动距离x = v0t + 1/2 * a * t^2 159 | move = v0 * t + 1 / 2 * a * t * t 160 | # 当前位移 161 | current += move 162 | # 加入轨迹 163 | track.append(round(move)) 164 | return track 165 | 166 | def move_to_gap(self, slider, track): 167 | """ 168 | 拖动滑块到缺口处 169 | :param slider: 滑块 170 | :param track: 轨迹 171 | :return: 172 | """ 173 | ActionChains(self.browser).click_and_hold(slider).perform() 174 | for x in track: 175 | ActionChains(self.browser).move_by_offset(xoffset=x, yoffset=0).perform() 176 | time.sleep(0.5) 177 | ActionChains(self.browser).release().perform() 178 | 179 | def login(self): 180 | """ 181 | 登录 182 | :return: None 183 | """ 184 | submit = self.wait.until(EC.element_to_be_clickable((By.CLASS_NAME, 'login-btn'))) 185 | submit.click() 186 | time.sleep(10) 187 | print('登录成功') 188 | 189 | def crack(self): 190 | # 输入用户名密码 191 | self.open() 192 | # 点击验证按钮 193 | button = self.get_geetest_button() 194 | button.click() 195 | # 获取验证码图片 196 | image1 = self.get_geetest_image('captcha1.png') 197 | # 点按呼出缺口 198 | slider = self.get_slider() 199 | slider.click() 200 | # 获取带缺口的验证码图片 201 | image2 = self.get_geetest_image('captcha2.png') 202 | # 获取缺口位置 203 | gap = self.get_gap(image1, image2) 204 | print('缺口位置', gap) 205 | # 减去缺口位移 206 | gap -= BORDER 207 | # 获取移动轨迹 208 | track = self.get_track(gap) 209 | print('滑动轨迹', track) 210 | # 拖动滑块 211 | self.move_to_gap(slider, track) 212 | 213 | success = self.wait.until( 214 | EC.text_to_be_present_in_element((By.CLASS_NAME, 'geetest_success_radar_tip_content'), '验证成功')) 215 | print(success) 216 | 217 | # 失败后重试 218 | if not success: 219 | self.crack() 220 | else: 221 | self.login() 222 | 223 | 224 | if __name__ == '__main__': 225 | crack = CrackGeetest() 226 | crack.crack() 227 | -------------------------------------------------------------------------------- /python3网络爬虫实战练习/16-中国知网验证码识别/验证码识别.py: -------------------------------------------------------------------------------- 1 | import tesserocr 2 | from PIL import Image 3 | 4 | 5 | #当识别得到验证码结果和实际图片内容有差异时,可以试着将图片处理一下,比如转灰度值,二值化,此外,还可以指定二值化的阈值 6 | 7 | # image=Image.open('code.jpg') 8 | # result=tesserort.image_to_text(image) 9 | # print(result) 10 | 11 | #直接将图片转化为文字 12 | # print(tesserocr.file_to_text('code.png')) 13 | 14 | #若图片内容转化有差异,可做如下处理 15 | # image=Image.open('code.jpg') 16 | # image=image.convert('L') #将图片转化为灰度图像 17 | # image.show() 18 | # image=image.convert('1') 19 | # image.show() #将图片进行二值化处理 20 | 21 | image=Image.convert('L') 22 | threshold=80 #设置二值化阈值 23 | table=[] 24 | for i in range(256): 25 | if i span').text() 30 
| if domain: 31 | page_url = 'https:' + domain 32 | self.crawl(page_url, callback=self.domain_page) 33 | 34 | def domain_page(self, response): 35 | name = response.doc('.mm-p-model-info-left-top dd > a').text() 36 | dir_path = self.deal.mkDir(name) 37 | brief = response.doc('.mm-aixiu-content').text() 38 | if dir_path: 39 | imgs = response.doc('.mm-aixiu-content img').items() 40 | count = 1 41 | self.deal.saveBrief(brief, dir_path, name) 42 | for img in imgs: 43 | url = img.attr.src 44 | if url: 45 | extension = self.deal.getExtension(url) 46 | file_name = name + str(count) + '.' + extension 47 | count += 1 48 | self.crawl(img.attr.src, callback=self.save_img, 49 | save={'dir_path': dir_path, 'file_name': file_name}) 50 | 51 | def save_img(self, response): 52 | content = response.content 53 | dir_path = response.save['dir_path'] 54 | file_name = response.save['file_name'] 55 | file_path = dir_path + '/' + file_name 56 | self.deal.saveImg(content, file_path) 57 | 58 | 59 | import os 60 | 61 | class Deal: 62 | def __init__(self): 63 | self.path = DIR_PATH 64 | if not self.path.endswith('/'): 65 | self.path = self.path + '/' 66 | if not os.path.exists(self.path): 67 | os.makedirs(self.path) 68 | 69 | def mkDir(self, path): 70 | path = path.strip() 71 | dir_path = self.path + path 72 | exists = os.path.exists(dir_path) 73 | if not exists: 74 | os.makedirs(dir_path) 75 | return dir_path 76 | else: 77 | return dir_path 78 | 79 | def saveImg(self, content, path): 80 | f = open(path, 'wb') 81 | f.write(content) 82 | f.close() 83 | 84 | def saveBrief(self, content, dir_path, name): 85 | file_name = dir_path + "/" + name + ".txt" 86 | f = open(file_name, "w+") 87 | f.write(content.encode('utf-8')) 88 | 89 | def getExtension(self, url): 90 | extension = url.split('.')[-1] 91 | return extension -------------------------------------------------------------------------------- /python3网络爬虫实战练习/19-TaoBaoMM/data/result.db: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coolcooljob/Python3WebSpider-Test/b1ff058286fcf10742d08e7c45a74e6f1369708a/python3网络爬虫实战练习/19-TaoBaoMM/data/result.db -------------------------------------------------------------------------------- /python3网络爬虫实战练习/20-代理的使用.py: -------------------------------------------------------------------------------- 1 | #urllib 2 | #输出结果是一个JSON,它有一个字段origin,表明了客户端的IP。 3 | from urllib.error import URLError 4 | from urllib.request import ProxyHandler,build_opener 5 | 6 | proxy='' 7 | # proxy='username:password@' #若碰到需要认证的代理,可以进行这样的设置。 8 | proxy_handler=ProxyHandler({ 9 | 'http':'http://'+proxy, 10 | 'https':'https://'+proxy, 11 | }) 12 | opener=build_opener(proxy_handler) 13 | try: 14 | response=opener.open('http://httpbin.org/get') 15 | print(response.read().decode('utf-8')) 16 | except URLError as e: 17 | print(e.reason) 18 | 19 | #requests 20 | import requests 21 | 22 | proxy='' 23 | proxy='username:password@' #有用户验证的情况 24 | proxies={ 25 | 'http':'http://'+proxy, 26 | 'https':'https://'+proxy, 27 | } 28 | try: 29 | response=requests.get('http://httpbin.org/get',proxies=proxies) 30 | print(response.text) 31 | except requests.exceptions.ConnectionError as e: 32 | print('Error',e.args) 33 | 34 | 35 | #selenium(有界面:Chrome,无界面:Plantomjs) 36 | #Chrome 37 | from selenium import webdriver 38 | 39 | proxy='' 40 | chrome_options=webdriver.ChromeOptions() 41 | chrome_options.add_argument('--proxy-server=http://'+proxy) 42 | browser=webdriver.Chrome(chrome_options=chrome_options) 43 | 
browser.get('http://httpbin.org/get') 44 | 45 | #plantomjs 46 | from selenium import webdriver 47 | 48 | service_args=[ 49 | '--proxy=', 50 | '--proxy-type=http' 51 | ] 52 | # service_args=[ 53 | # '--proxy=127.0.0.1:9743', 54 | # '--proxy-type=http', 55 | # '--proxy-auth=username:password' 56 | # ] #需要进行认证设置的时候 57 | browser=webdriver.PhantomJS(service_args=service_args) 58 | browser.get('http://httpbin.org/get') 59 | print(browser.page_source) -------------------------------------------------------------------------------- /python3网络爬虫实战练习/3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coolcooljob/Python3WebSpider-Test/b1ff058286fcf10742d08e7c45a74e6f1369708a/python3网络爬虫实战练习/3.jpg -------------------------------------------------------------------------------- /python3网络爬虫实战练习/data.csv: -------------------------------------------------------------------------------- 1 | id name age 2 | 1 job 20 3 | 2 jack 22 4 | -------------------------------------------------------------------------------- /python3网络爬虫实战练习/data/project.db: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coolcooljob/Python3WebSpider-Test/b1ff058286fcf10742d08e7c45a74e6f1369708a/python3网络爬虫实战练习/data/project.db -------------------------------------------------------------------------------- /python3网络爬虫实战练习/data/result.db: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coolcooljob/Python3WebSpider-Test/b1ff058286fcf10742d08e7c45a74e6f1369708a/python3网络爬虫实战练习/data/result.db -------------------------------------------------------------------------------- /python3网络爬虫实战练习/data/scheduler.1d: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coolcooljob/Python3WebSpider-Test/b1ff058286fcf10742d08e7c45a74e6f1369708a/python3网络爬虫实战练习/data/scheduler.1d -------------------------------------------------------------------------------- /python3网络爬虫实战练习/data/scheduler.1h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coolcooljob/Python3WebSpider-Test/b1ff058286fcf10742d08e7c45a74e6f1369708a/python3网络爬虫实战练习/data/scheduler.1h -------------------------------------------------------------------------------- /python3网络爬虫实战练习/data/scheduler.all: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coolcooljob/Python3WebSpider-Test/b1ff058286fcf10742d08e7c45a74e6f1369708a/python3网络爬虫实战练习/data/scheduler.all -------------------------------------------------------------------------------- /python3网络爬虫实战练习/data/task.db: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coolcooljob/Python3WebSpider-Test/b1ff058286fcf10742d08e7c45a74e6f1369708a/python3网络爬虫实战练习/data/task.db -------------------------------------------------------------------------------- /python3网络爬虫实战练习/data1.csv: -------------------------------------------------------------------------------- 1 | id,name,age 2 | 100,job,22 3 | 101,tom,32 4 | 102,mary,25 5 | -------------------------------------------------------------------------------- /python3网络爬虫实战练习/ghostdriver.log: -------------------------------------------------------------------------------- 1 | [INFO - 2018-05-19T03:48:03.597Z] GhostDriver - Main - running on 
port 52052 2 | [INFO - 2018-05-19T03:48:08.043Z] Session [716db230-5b17-11e8-aebb-055f3c8bc10e] - page.settings - {"XSSAuditingEnabled":false,"javascriptCanCloseWindows":true,"javascriptCanOpenWindows":true,"javascriptEnabled":true,"loadImages":true,"localToRemoteUrlAccessEnabled":false,"userAgent":"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/538.1 (KHTML, like Gecko) PhantomJS/2.1.1 Safari/538.1","webSecurityEnabled":true} 3 | [INFO - 2018-05-19T03:48:08.043Z] Session [716db230-5b17-11e8-aebb-055f3c8bc10e] - page.customHeaders: - {} 4 | [INFO - 2018-05-19T03:48:08.043Z] Session [716db230-5b17-11e8-aebb-055f3c8bc10e] - Session.negotiatedCapabilities - {"browserName":"phantomjs","version":"2.1.1","driverName":"ghostdriver","driverVersion":"1.2.0","platform":"windows-10-32bit","javascriptEnabled":true,"takesScreenshot":true,"handlesAlerts":false,"databaseEnabled":false,"locationContextEnabled":false,"applicationCacheEnabled":false,"browserConnectionEnabled":false,"cssSelectorsEnabled":true,"webStorageEnabled":false,"rotatable":false,"acceptSslCerts":false,"nativeEvents":true,"proxy":{"proxyType":"direct"}} 5 | [INFO - 2018-05-19T03:48:08.044Z] SessionManagerReqHand - _postNewSessionCommand - New Session Created: 716db230-5b17-11e8-aebb-055f3c8bc10e 6 | --------------------------------------------------------------------------------