├── README.md
└── python3网络爬虫实战练习
    ├── 01-flask测试.py
    ├── 02-tornado测试.py
    ├── 03-BeautifulSoup.py
    ├── 04-pyquery.py
    ├── 05-数据存储.py
    ├── 06-json数据格式.py
    ├── 07-csv文件.py
    ├── 08-关系型数据库存储.py
    ├── 09-redis连接.py
    ├── 10-利用python模拟Ajax请求爬取微博详情.py
    ├── 11-分析Ajax爬取今日头条街拍美图.py
    ├── 12-selenium的使用.py
    ├── 13-selenium使用2.py
    ├── 14-splash使用.py
    ├── 15-使用selenium爬取淘宝商品.py
    ├── 16-中国知网验证码识别
    │   ├── __init__.py
    │   ├── captcha1.png
    │   ├── captcha2.png
    │   ├── code.jpg
    │   ├── 极验验证码.py
    │   └── 验证码识别.py
    ├── 16-爬取虎牙直播主播信息.py
    ├── 17-借助在线验证码识别平台完成点出验证码的识别
    │   ├── Chaojiying.py
    │   └── 点触验证码的识别.py
    ├── 19-TaoBaoMM
    │   ├── TaoBaoMM.py
    │   └── data
    │       └── result.db
    ├── 20-代理的使用.py
    ├── 3.jpg
    ├── data.csv
    ├── data
    │   ├── project.db
    │   ├── result.db
    │   ├── scheduler.1d
    │   ├── scheduler.1h
    │   ├── scheduler.all
    │   └── task.db
    ├── data1.csv
    └── ghostdriver.log
/README.md:
--------------------------------------------------------------------------------
1 | # Python3WebSpider-Test
2 | Python3网络爬虫实战练习
3 |
--------------------------------------------------------------------------------
/python3网络爬虫实战练习/01-flask测试.py:
--------------------------------------------------------------------------------
1 | from flask import Flask
2 |
3 | app=Flask(__name__)
4 |
5 | @app.route("/")
6 | def hello():
7 | return 'Hello World!'
8 |
9 |
10 | if __name__=='__main__':
11 | app.run()
--------------------------------------------------------------------------------
/python3网络爬虫实战练习/02-tornado测试.py:
--------------------------------------------------------------------------------
1 | import tornado.ioloop
2 | import tornado.web
3 |
4 | class MainHandler(tornado.web.RequestHandler):
5 | def get(self):
6 | self.write('Hello World!')
7 |
8 | def make_app():
9 | return tornado.web.Application([(r"/",MainHandler),])
10 |
11 | if __name__=='__main__':
12 | app=make_app()
13 | app.listen(8888)
14 | tornado.ioloop.IOLoop.current().start()
--------------------------------------------------------------------------------
/python3网络爬虫实战练习/03-BeautifulSoup.py:
--------------------------------------------------------------------------------
1 | import requests
2 | from bs4 import BeautifulSoup #利用BeautifulSoup将网页的代码按照标准的缩进打印出来
3 | import re
4 |
5 | # response=requests.get('http://www.baidu.com')
6 | # response.encoding='utf-8'
7 | # print(response.text)
8 |
9 | # 基本用法
10 | html=requests.get('https://www.baidu.com')
11 | soup=BeautifulSoup(html.content,'lxml') #注意这里的html.content,不能直接传入html
12 | print(soup.prettify())
13 | print(soup.title.string)
14 |
15 | # 选择元素
16 | print(soup.title)
17 | print(soup.p)
18 |
19 | # 提取信息
20 | print(soup.title.name) # 获取名称
21 | print(soup.p.attrs) # 获取属性
22 | print(soup.p.attrs['name'])
23 |
24 | # 获取内容
25 | print(soup.p.string)
26 |
27 | # 嵌套选择
28 | print(soup.head.title)
29 | print(type(soup.head.title))
30 | print(soup.head.title.string)
31 |
32 | # 关联选择
33 | # 子节点和子孙节点
34 | print(soup.p.contents) # 获取p节点的直接子节点(以列表形式返回,注意不包括子孙节点)
35 | print(soup.p.children) # 返回结果是生成器类型,接下来用for循环
36 | for i,child in enumerate(soup.p.children):
37 | print(i,child)
38 |
39 | print(soup.p.descendants) # 获取子孙节点
40 | for i,child in enumerate(soup.p.descendants):
41 | print(i,child)
42 |
43 | # 父节点和祖先节点
44 | print(soup.a.parent) # 获取a节点的直接父节点
45 | print(list(soup.a.parents)) # 获取a节点的祖先节点(parents返回生成器,转成列表便于查看)
46 |
47 | # 兄弟节点
48 | print(soup.a.next_sibling)
49 | print(soup.a.previous_sibling)
50 |
51 | # 提取信息
52 | print(soup.a.next_sibling.string)
53 | print(soup.a.previous_sibling.text)
54 |
55 | # 方法选择器
56 | # find_all(name,attrs,recursive,text,**kwargs)
57 | print(soup.find_all('ul'))
58 | for ul in soup.find_all(name='ul'):
59 | print(ul.find_all(name='li'))
60 |
61 | print(soup.find_all(attrs={'name':'list-1'})) # 返回所有匹配的元素组成的列表
62 | print(soup.find_all(id='list-1'))
63 |
64 | print(soup.find_all(text=re.compile('link')))
65 |
66 | print(soup.find(name='ul')) # 返回第一个匹配的元素
67 |
68 | # CSS选择器
69 | # print(soup.select('ul li'))
70 | # print(soup.select('#list-2 .element'))
71 | # print(soup.select('.panel .panel-heading'))
72 |
73 | # 获取属性
74 | for ul in soup.select('ul'):
75 | print(ul['id'])
76 | print(ul.attrs['id'])
77 |
78 | # 获取文本
79 | for li in soup.select('li'):
80 | print(li.get_text())
81 | print(li.string)
82 | print(li.string)
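83 |
84 | # A minimal self-contained sketch: the attribute / find_all / CSS-selector calls above expect nodes such as
85 | # <ul id="list-1"> and <li> elements, which baidu.com's homepage will not necessarily contain. The sample
86 | # markup below is illustrative only (an assumption, not taken from any real page).
87 | sample_html='''
88 | <div class="panel"><div class="panel-heading"><h4>Hello</h4></div>
89 | <ul class="list" id="list-1" name="elements"><li class="element">Foo</li><li class="element">Bar</li></ul>
90 | <ul class="list list-small" id="list-2"><li class="element">link1</li></ul></div>
91 | '''
92 | sample_soup=BeautifulSoup(sample_html,'lxml')
93 | print(sample_soup.find_all(attrs={'name':'elements'}))   # matches the first <ul> via its name attribute
94 | print(sample_soup.find_all(id='list-1'))                  # keyword-argument form of the same query
95 | print(sample_soup.select('#list-2 .element'))             # CSS selector form, cf. the commented select() calls above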
--------------------------------------------------------------------------------
/python3网络爬虫实战练习/04-pyquery.py:
--------------------------------------------------------------------------------
1 | from pyquery import PyQuery as pq
2 | import requests
3 |
4 | # URL初始化
5 | doc=pq(url='https://www.cuiqingcai.com') #请求url,得到结果
6 | doc=pq(requests.get('https://www.cuiqingcai.com').text) #等价于上面那句
7 | print(doc('title'))
8 |
9 | # 文件初始化
10 | doc=pq(filename='demo.html')
11 | print(doc('li'))
12 |
13 | # 基本CSS选择器
14 | html='''
15 | <div id="container"><ul class="list"><li class="item-0">first item</li><li class="item-1"><a href="link2.html">second item</a></li><li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li><li class="item-1 active"><a href="link4.html">fourth item</a></li><li class="item-0"><a href="link5.html">fifth item</a></li></ul></div>
16 | '''
17 | doc=pq(html)
18 | print(doc('#container .list li'))
19 | print(type(doc('#container .list li')))
20 |
21 | # 查找节点
22 | # 子节点
23 | html='''
24 | <div id="container"><ul class="list"><li class="item-0">first item</li><li class="item-1"><a href="link2.html">second item</a></li><li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li><li class="item-1 active"><a href="link4.html">fourth item</a></li><li class="item-0"><a href="link5.html">fifth item</a></li></ul></div>
25 | '''
26 | doc=pq(html)
27 | items=doc('.list')
28 | print(type(items))
29 | print(items)
30 | lis=items.find('li') # 查找所有的子孙节点
31 | # lis=items.children() # 查找所有的子节点
32 | # lis=items.children('.active') # 查找子节点中class为active的节点
33 | print(type(lis))
34 | print(lis)
35 |
36 |
37 | # 父节点
38 | doc=pq(html)
39 | items=doc('.list')
40 | container=items.parent() # 查找父节点
41 | # container=items.parents() # 查找祖先节点
42 | print(type(container))
43 | print(container)
44 |
45 | # # 兄弟节点
46 | doc=pq(html)
47 | li=doc('.list .item-0.active') # 选取class为list的节点内部class为item-0和active的节点
48 | print(li.siblings()) # 查找兄弟节点
49 |
50 |
51 | # 遍历(多个节点),调用items方法
52 | doc=pq(html)
53 | lis=doc('li').items()
54 | print(type(lis))
55 | for li in lis:
56 | print(li,type(li))
57 |
58 | # 获取信息
59 | # 获取属性
60 | doc=pq(html)
61 | a=doc('.item-0.active a')
62 | print(a,type(a))
63 | print(a.attr('href')) # 与下面等价
64 | # print(a.attr.href)
65 |
66 | # 获取多个属性,需要遍历才能实现
67 | doc=pq(html)
68 | a=doc('a')
69 | for i in a.items():
70 |     print(i.attr('href'))
71 |
72 | # 获取文本(调用text()方法实现)
73 | doc=pq(html)
74 | a=doc('.item-0.active a')
75 | print(a)
76 | print(a.text()) # 去掉节点的包括的所有HTML,只返回纯文字内容
77 |
78 | # 获取HTML内容
79 | doc=pq(html)
80 | li=doc('.item-0.active')
81 | print(li)
82 | print(li.html())
83 |
84 | # 如果选取的是多个节点,html()返回的是第一个节点内部的HTML文本,而text()返回的是所有节点内部的纯文本
85 |
86 | # 节点操作,比如为某个节点移除或者增添一个class
87 | doc=pq(html)
88 | li=doc('.item-0.active')
89 | print(li)
90 | li.remove_class('active')
91 | print(li)
92 | li.add_class('active')
93 |
94 | # attr,text和html(改变属性,文本,以及html)
95 | doc=pq(html)
96 | li=doc('.item-0.active')
97 | li.attr('name','link')
98 | print(li)
99 | li.text('changed item')
100 | print(li)
101 | li.html('changed item')
102 | print(li)
103 |
104 |
105 | # remove()
106 | # wrap.find('p').remove()
107 | # print(wrap.text())
108 |
109 | # 伪类选择器
110 |
111 |
112 |
113 |
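114 | # The "伪类选择器" heading above was left empty; a brief sketch of pyquery's CSS pseudo-class support, run
115 | # against illustrative sample markup (the HTML string and node texts below are assumptions for the demo):
116 | pseudo_html='''
117 | <ul class="list"><li class="item-0">first item</li><li class="item-1"><a href="link2.html">second item</a></li><li class="item-0 active"><a href="link3.html">third item</a></li></ul>
118 | '''
119 | doc=pq(pseudo_html)
120 | print(doc('li:first-child'))      # first li node
121 | print(doc('li:last-child'))       # last li node
122 | print(doc('li:nth-child(2)'))     # second li node
123 | print(doc('li:gt(0)'))            # li nodes whose index is greater than 0
124 | print(doc('li:contains(second)')) # li nodes whose text contains "second"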
--------------------------------------------------------------------------------
/python3网络爬虫实战练习/05-数据存储.py:
--------------------------------------------------------------------------------
1 | import requests
2 | from pyquery import PyQuery as pq
3 |
4 | url='https://www.zhihu.com/'
5 | headers={
6 | 'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.170 Safari/537.36'
7 | }
8 | html=requests.get(url=url,headers=headers).text
9 | doc=pq(html)
10 | # print(doc)
11 | items=doc('.explore-tab .feed-item').items()
12 | print(items)
13 | for item in items:
14 | question=item.find('h2').text()
15 | print('1')
16 | author=item.find('.author-link-line').text()
17 | answer=pq(item.find('.content').html()).text()
18 |     with open('explore.txt','a',encoding='utf-8') as f:
19 |         f.write('\n'.join([question,author,answer]))
20 |         f.write('\n'+'='*50+'\n')
21 |     # with语句结束后文件会自动关闭,无需再调用f.close()
22 |
23 |
24 | # response2 = requests.get('https://p4.ssl.cdn.btime.com/t01f7081c44b722510b.jpg')
25 | #
26 | # with open('3.jpg','wb') as f:
27 | # f.write(response2.content)
28 | # f.close()
29 |
--------------------------------------------------------------------------------
/python3网络爬虫实战练习/06-json数据格式.py:
--------------------------------------------------------------------------------
1 | import json
2 |
3 | #json.loads()可以将文本字符串转化为json对象,可以是列表,也可以是字典
4 | #json.dumps()可以将json对象转化为文本字符串
5 | #json的数据需要用双引号来包围,不能使用单引号,也不能在数据内部写注释,否则loads()方法会解析失败
6 |
7 | str='''[{
8 |     "name":"job",
9 | "age":"20",
10 | "gender":"boy"
11 | },{
12 | "name":"jack",
13 | "age":"22",
14 | "gender":"boy"
15 | }]'''
16 | print(type(str))
17 | data=json.loads(str)
18 | print(type(data))
19 | print(data)
20 |
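21 | # The comments above also mention json.dumps(), which serializes a Python object back into a JSON string; a
22 | # brief sketch that writes the parsed data back out (the file name 'data.json' is illustrative):
23 | with open('data.json','w',encoding='utf-8') as f:
24 |     # ensure_ascii=False keeps non-ASCII text readable, indent=2 pretty-prints the output
25 |     f.write(json.dumps(data,ensure_ascii=False,indent=2))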
--------------------------------------------------------------------------------
/python3网络爬虫实战练习/07-csv文件.py:
--------------------------------------------------------------------------------
1 | import csv
2 | import pandas as pd #利用pandas读取csv数据文件
3 |
4 | #csv,其文件以纯文本的形式存储表格数据,相当于一个结构化表的纯文本形式
5 | #写入
6 | with open('data.csv','w') as csvfile: #打开csv文件,获得文件句柄
7 | writer=csv.writer(csvfile) #初始化传入对象,传入该句柄
8 | writer=csv.writer(csvfile,delimiter=' ') #可以修改数据之间的分隔符,默认是逗号
9 | writer.writerow(['id','name','age']) #以writerow()传入每行的数据
10 | writer.writerow(['1','job','20'])
11 | writer.writerow(['2','jack','22'])
12 |
13 |
14 | #写入字典格式的数据
15 | with open('data1.csv','w',encoding='utf-8') as csvfile:
16 | fieldnames=['id','name','age'] #先定义三个字段,用filenames表示
17 | writer=csv.DictWriter(csvfile,fieldnames=fieldnames) #将字段传入给Dictwriter来初始化一个字典写入对象
18 | writer.writeheader() #写入头信息
19 | writer.writerow({'id':'100','name':'job','age':22}) #传入相应字段
20 | writer.writerow({'id':'101','name':'tom','age':32})
21 | writer.writerow({'id':'102','name':'mary','age':25})
22 |
23 | #读取
24 | with open('data1.csv','r',encoding='utf-8') as csvfile:
25 | reader=csv.reader(csvfile)
26 | for row in reader:
27 | print(row)
28 |
29 |
30 |
31 | df=pd.read_csv('data1.csv')
32 | print(df)
33 |
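34 | # The dictionary-based writer above has a reading counterpart; a brief sketch with csv.DictReader, which yields
35 | # one dict per row keyed by the header line of data1.csv:
36 | with open('data1.csv','r',encoding='utf-8') as csvfile:
37 |     dict_reader=csv.DictReader(csvfile)
38 |     for row in dict_reader:
39 |         print(row['id'],row['name'],row['age'])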
--------------------------------------------------------------------------------
/python3网络爬虫实战练习/08-关系型数据库存储.py:
--------------------------------------------------------------------------------
1 | #连接数据库
2 | import pymysql
3 |
4 | # db=pymysql.connect(host='localhost',user='root',password='748491',port=3306)
5 | # cursor=db.cursor()
6 | # cursor.execute('select version()')
7 | # data=cursor.fetchone()
8 | # print('database version:',data)
9 | # cursor.execute('create database spiders default character set utf8')
10 | # db.close()
11 |
12 | #插入数据(建表的sql语句已注释在下方)
13 | id='101'
14 | user='job'
15 | age='22'
16 | db=pymysql.connect(host='localhost',user='root',password='748491',port=3306,db='spiders')
17 | #sql='create table if not exists students (id varchar(255) not null,name varchar(255) not null,age int not null ,primary key(id))'
18 | #注意传入数据的时候尽量使用这种格式化符的形式,有几个value写几个%s,只需要在execute()方法的第一个参数传入该sql语句,value值统一传一个元组就好了
19 | sql='insert into students(id,name,age) values(%s,%s,%s)'
20 | cursor=db.cursor()
21 | # cursor.execute(sql1)
22 | try:
23 | cursor.execute(sql,(id,user,age))
24 | db.commit() #必须执行commit()方法才能将数据插入数据库中
25 | except:
26 | db.rollback() #异常处理,如果插入数据失败,就执行一次回滚,相当于什么都没有发生过
27 | db.close()
28 |
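29 | # Reading rows back works the same way as the insert above: execute() a parameterized query, then fetch the
30 | # results. A brief sketch against the same spiders.students table, reusing the connection settings from above
31 | # (the connection is re-opened here because it was closed at the end of the insert):
32 | db=pymysql.connect(host='localhost',user='root',password='748491',port=3306,db='spiders')
33 | cursor=db.cursor()
34 | try:
35 |     cursor.execute('select * from students where age >= %s',(20,))
36 |     print('Count:',cursor.rowcount)          # number of matched rows
37 |     for row in cursor.fetchall():            # fetchall() returns the remaining rows as a tuple of tuples
38 |         print(row)
39 | except Exception as e:
40 |     print('Query failed:',e)
41 | finally:
42 |     db.close()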
--------------------------------------------------------------------------------
/python3网络爬虫实战练习/09-redis连接.py:
--------------------------------------------------------------------------------
1 | from redis import StrictRedis,ConnectionPool
2 |
3 |
4 | redis=StrictRedis(host='localhost',port=6379,db=0,password=None) #两种连接方式都可以
5 | # pool=ConnectionPool(host='localhost',port=6379,db=0,password=None)
6 | # redis=StrictRedis(connection_pool=pool)
7 | redis.set('name','job')
8 | redis.set('age',22)
9 | print(redis.get('name'))
10 | print(redis.exists('name'))
11 | redis.delete('age')
12 | print(redis.type('name')) #查看键name的数据类型
13 | print(redis.keys('n*'))
14 | print(redis.randomkey())
15 | redis.move('name',2)
16 | # redis.flushdb() #删除当前选择数据库中的所有的键
17 | # redis.flushall() #删除所有数据库中的所有的键
18 | print(redis.dbsize()) #获取当前数据库中的所有键的数目
19 |
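20 | # Beyond the key/string commands above, the same StrictRedis client also exposes list and hash commands; a brief
21 | # sketch (the key names 'students' and 'student:1' are illustrative):
22 | redis.rpush('students','job','jack')      # append two values to a list
23 | print(redis.lrange('students',0,-1))      # read the whole list back
24 | redis.hset('student:1','name','job')      # set a single field of a hash
25 | redis.hset('student:1','age',22)
26 | print(redis.hgetall('student:1'))         # read the whole hash back as a dict
27 | redis.delete('students','student:1')      # clean up the demo keys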
--------------------------------------------------------------------------------
/python3网络爬虫实战练习/10-利用python模拟Ajax请求爬取微博详情.py:
--------------------------------------------------------------------------------
1 | import requests
2 | from urllib.parse import urlencode
3 | from pyquery import PyQuery as pq
4 | from pymongo import MongoClient
5 |
6 | client=MongoClient() #连接moongodb数据库,定义一些需要用到的变量
7 | db=client['weibo']
8 | collection=db['weibo']
9 |
10 | base_url=''
11 | headers={
12 |     'Host':'m.weibo.cn',
13 |     'Referer':'https://m.weibo.cn/',
14 |     'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.170 Safari/537.36',
15 |     'X-Requested-With':'XMLHttpRequest'
16 | }
17 |
18 | def get_page(page): #定义一个获取ajax请求内容的函数,结果返回json数据格式
19 | params={
20 | 'type':'uid',
21 | 'value':'',
22 | 'contained':'',
23 | 'page':page
24 | }
25 | url=base_url+urlencode(params)
26 | try:
27 | response=requests.get(url,headers=headers)
28 | if response.status_code==200:
29 | return response.json()
30 | except requests.ConnectionError as e:
31 | print('Error',e.args)
32 |
33 | def parse_page(json): #解析返回的json数据,并返回一个字典,包括微博的id,正文,赞数,评论数以及转发数
34 | if json:
35 | items=json.get('data').get('cards')
36 | for item in items:
37 | item=item.get('mblog')
38 | weibo={}
39 |             weibo['id']=item.get('id')
40 | weibo['text']=pq(item.get('text')).text() #借助pquery将正文中的html去掉
41 | weibo['attitudes']=item.get('attitudes_count')
42 | weibo['comments']=item.get('comments_count')
43 | weibo['reposts']=item.get('reposts_count')
44 | yield weibo
45 |
46 | def save_to_mongo(result): #定义一个将数据存到mongodb数据库的方法
47 | if collection.insert(result):
48 | print('Save to Mongo!')
49 |
50 | if __name__=='__main__':
51 | for page in range(1,11): #遍历一下page,一共10页,将提取到的结果打印出来
52 | json=get_page(page)
53 | results=parse_page(json)
54 | for result in results:
55 | print(result)
56 | save_to_mongo(result)
--------------------------------------------------------------------------------
/python3网络爬虫实战练习/11-分析Ajax爬取今日头条街拍美图.py:
--------------------------------------------------------------------------------
1 | import requests
2 | from urllib.parse import urlencode
3 | import os
4 | from hashlib import md5
5 | from multiprocessing.pool import Pool
6 | from redis import StrictRedis
7 |
8 |
9 | redis=StrictRedis(host='localhost',port=6379,db=0,password=None)
10 |
11 | def get_one_page(offset): #实现方法来加载单个Ajax请求的结果,返回json的字符串格式
12 | params={
13 | 'offset':offset,
14 | 'format':'json',
15 | 'keyword':'街拍',
16 | 'autoload':'true',
17 | 'count':'20',
18 | 'cur_tab':'1',
19 | }
20 | url='https://www.toutiao.com/search_content/?'+urlencode(params)
21 | try:
22 | response=requests.get(url)
23 | if response.status_code==200:
24 | return response.json()
25 | except requests.ConnectionError as e:
26 | return None
27 |
28 | def get_images(json): #实现一个解析方法,提取每条数据的image_list字段中的每一张图片的链接,将图片链接和图片所属的标题一并返回,可以构造一个生成器
29 | if json.get('data'):
30 | for item in json.get('data'):
31 | title=item.get('title')
32 | images=item.get('image_list')
33 | if images:
34 | for image in images:
35 | yield{
36 | 'image':'http:'+image.get('url'),
37 | 'title':title,
38 | }
39 | else:
40 | return None
41 | #定义一个保存图片的方法,首先根据item的title来创建文件夹,然后请求这个图片链接,获取图片的二进制数据,以二进制的形式写
42 | #入文件。图片的名称可以使用其内容的MD5值,这样可以去除重复。
43 | def save_image(item):
44 | if not os.path.exists(item.get('title')):
45 | os.mkdir(item.get('title'))
46 | try:
47 | response=requests.get(item.get('image'))
48 | if response.status_code==200:
49 | file_path='{0}/{1}{2}'.format(item.get('title'),md5(response.content).hexdigest(),'.jpg')
50 | if not os.path.exists(file_path):
51 | with open(file_path,'wb') as f:
52 | f.write(response.content)
53 | else:
54 | print('Already Downloaded',file_path)
55 | # save_to_redis(f)
56 | except requests.ConnectionError:
57 | print('Failed to Save Image!')
58 | except requests.exceptions.MissingSchema as rem:
59 | print(rem)
60 |
61 | # def save_to_redis(f): #定义一个将下载记录存到redis的方法(redis并没有insert方法,可行的写法见文件末尾的示例)
62 | #     if redis.insert(f):
63 | #         print('Save to Redis!')
64 |
65 |
66 | def main(offset):
67 | json=get_one_page(offset)
68 | for item in get_images(json):
69 | print(item)
70 | save_image(item)
71 |
72 |
73 | GROUP_START=1
74 | GROUP_END=20
75 | if __name__=='__main__':
76 | pool=Pool()
77 | groups=([x*20 for x in range(GROUP_START,GROUP_END+1)])
78 |     pool.map(main,groups) #利用multiprocessing的进程池,调用其map方法实现多进程下载
79 | pool.close()
80 | pool.join()
81 |
82 |
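83 | # The commented-out save_to_redis() stub above calls redis.insert(), which is not a redis-py method. One possible
84 | # way to realize the idea (an assumption, not necessarily the author's intended design) is to record each saved
85 | # file path in a Redis set, which also gives de-duplication for free:
86 | def save_to_redis(file_path):
87 |     # sadd() returns 1 when the member is new and 0 when it was already present in the set
88 |     if redis.sadd('downloaded_images',file_path):
89 |         print('Recorded in Redis:',file_path)
90 |     else:
91 |         print('Already recorded in Redis:',file_path)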
--------------------------------------------------------------------------------
/python3网络爬虫实战练习/12-selenium的使用.py:
--------------------------------------------------------------------------------
1 | from selenium import webdriver
2 | from selenium.webdriver.common.by import By
3 | from selenium.webdriver.common.keys import Keys
4 | from selenium.webdriver.support import expected_conditions as EC
5 | from selenium.webdriver.support.wait import WebDriverWait
6 |
7 |
8 | #通过selenium来驱动浏览器加载网页的话,可以直接拿到JavaScript渲染的结果了,不用担心使用的是什么加密结果
9 | browser=webdriver.Chrome()
10 | try:
11 | browser.get('https://www.baidu.com')
12 | # input_=browser.find_elements_by_id('kw')
13 | # input_.send_keys('Python')
14 | # input_.send_keys(Keys.ENTER)
15 | # wait=WebDriverWait(browser,10)
16 | # wait.until(EC.presence_of_element_located((By.ID,'content-left')))
17 | print(browser.current_url)
18 | print(browser.get_cookies())
19 | print(browser.page_source)
20 | finally:
21 | browser.close()
22 |
23 |
24 |
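25 | # The commented-out lines in the try block above sketch a search flow, but find_elements_by_id() returns a list
26 | # and 'content-left' is not Baidu's result-container id. A possible working version of the same idea (the ids
27 | # 'kw' and 'content_left' are Baidu's at the time of writing and may change):
28 | browser=webdriver.Chrome()
29 | try:
30 |     browser.get('https://www.baidu.com')
31 |     input_=browser.find_element_by_id('kw')       # the search box (a single element, not a list)
32 |     input_.send_keys('Python')
33 |     input_.send_keys(Keys.ENTER)
34 |     wait=WebDriverWait(browser,10)
35 |     wait.until(EC.presence_of_element_located((By.ID,'content_left')))   # wait for the result container
36 |     print(browser.current_url)
37 | finally:
38 |     browser.close()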
--------------------------------------------------------------------------------
/python3网络爬虫实战练习/13-selenium使用2.py:
--------------------------------------------------------------------------------
1 | from selenium import webdriver
2 | import time
3 | from selenium.webdriver import ActionChains #动作链
4 |
5 | #查找节点并访问页面
6 | # browser=webdriver.Chrome()
7 | # browser.get('http://www.taobao.com')
8 | # lis=browser.find_elements_by_css_selector('body > div.screen-outer.clearfix > div.main > div.main-inner.clearfix > div.tbh-service.J_Module > div > ul > li:nth-child(3) > a:nth-child(1)')
9 | # print(lis)
10 | # browser.close()
11 |
12 | #节点交互
13 | # browser=webdriver.Chrome()
14 | # browser.get('https://www.taobao.com')
15 | # input=browser.find_element_by_id('q')
16 | # input.send_keys('iphone')
17 | # time.sleep(3)
18 | # input.clear()
19 | # input.send_keys('iPad')
20 | # button=browser.find_element_by_class_name('btn-search')
21 | # button.click()
22 | # time.sleep(3)
23 | # browser.close()
24 |
25 | #动作链,实现一个节点的拖拽
26 | # browser=webdriver.Chrome()
27 | # url=''
28 | # browser.get(url)
29 | # browser.switch_to.frame('')
30 | # source=browser.find_element_by_css_selector('#draggeable')
31 | # target=browser.find_element_by_css_selector('#droppable')
32 | # actions=ActionChains(browser)
33 | # actions.drag_and_drop(source,target)
34 | # actions.perform()
35 |
36 | #执行JavaScript(比如下拉进度条)
37 | # from selenium import webdriver
38 | #
39 | # browser=webdriver.Chrome()
40 | # browser.get('http://www.zhihu.com/explore')
41 | # browser.execute_script('window.scrollTo(0,document.body.scrollHeight)') #执行下拉进度条的脚本
42 | # browser.execute_script('alert("To Bottom")') #下拉到底部之后弹出一段提示
43 | # browser.close()
44 |
45 | #获取节点信息
46 |
47 | #获取属性(前提是先选中这个节点)
48 | from selenium import webdriver
49 | from selenium.webdriver import ActionChains
50 |
51 | # browser=webdriver.Chrome()
52 | # browser.get('http://www.zhihu.com/explore')
53 | # logo=browser.find_element_by_id('zh-top-link-logo') #选中这个logo
54 | # print(logo)
55 | # print(logo.get_attribute('class'))
56 | # browser.close()
57 |
58 | #获取文本值
59 | # from selenium import webdriver
60 | #
61 | # browser=webdriver.PhantomJS()
62 | # browser.get('http://www.zhihu.com/explore')
63 | # input=browser.find_element_by_class_name('zu-top-add-question') #获取提问这个按钮,然后将此按钮的文本内容打印出来
64 | # print(input.text)
65 | #
66 | # #获取ID,标签名
67 | # print(input.id)
68 | # print(input.location)
69 | # print(input.tag_name)
70 | # print(input.size)
71 |
72 | #切换frame
73 | # import time
74 | # from selenium import webdriver
75 | # from selenium.common.exceptions import NoSuchElementException
76 | #
77 | # browser=webdriver.Chrome()
78 | # url='http://www.runoob.com/try/try.php?filename=jqueryui-api-droppable'
79 | # browser.get(url)
80 | # browser.switch_to.frame('iframeResult') #切换至子级Frame里面,然后尝试获取父级里面的logo,找不到的话,抛出异常
81 | # try:
82 | # logo=browser.find_element_by_class_name('logo')
83 | # except NoSuchElementException:
84 | # print('No Logo!')
85 | # browser.switch_to.parent_frame() #切换至父级Frame里面,再次获取节点
86 | # logo=browser.find_element_by_class_name('logo')
87 | # print(logo)
88 | # print(logo.text)
89 | # browser.close()
90 |
91 | #延时等待(显式等待,隐式等待)
92 | #隐式等待,超出等待时间后,将抛出超时异常
93 | # from selenium import webdriver
94 | # browser=webdriver.Chrome()
95 | # browser.implicitly_wait(10)
96 | # browser.get('http://www.zhihu.com/explore')
97 | # input=browser.find_element_by_class_name('zu-top-add-question')
98 | # print(input)
99 | # browser.close()
100 |
101 | #显式等待,指定要查找的节点,然后指定一个最长等待时间,如果在规定时间内加载出来,就返回查找的节点,否则就抛出超时异常
102 | # from selenium import webdriver
103 | # from selenium.webdriver.common.by import By
104 | # from selenium.webdriver.support.ui import WebDriverWait
105 | # from selenium.webdriver.support import expected_conditions as EC
106 | #
107 | # browser=webdriver.Chrome()
108 | # browser.get('http://www.taobao.com')
109 | # wait=WebDriverWait(browser,10)
110 | # input=wait.until(EC.presence_of_element_located((By.ID,'q'))) #传入条件,表示等待节点出现,下面的表示节点按钮可点击
111 | # button=wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR,'.btn-search')))
112 | # print(input,button)
113 | # browser.close()
114 |
115 | #前进和后退(back(),forward())
116 | # import time
117 | # from selenium import webdriver
118 | #
119 | # browser=webdriver.Chrome()
120 | # browser.get('http://www.baidu.com/')
121 | # browser.get('http://www.taobao.com/')
122 | # # browser.get('http://www.python.org/')
123 | # browser.back()
124 | # time.sleep(1)
125 | # browser.forward()
126 | # browser.close()
127 |
128 |
129 | #cookies
130 | # from selenium import webdriver
131 | #
132 | # browser=webdriver.Chrome()
133 | # browser.get('http://www.zhihu.com/explore')
134 | # print(browser.get_cookies())
135 | # browser.add_cookie({'name':'name','domain':'www.zhihu.com','value':'job'})
136 | # print(browser.get_cookies())
137 | # browser.delete_all_cookies()
138 | # print(browser.get_cookies())
139 | # browser.close()
140 |
141 | #选项卡操作
142 | import time
143 | from selenium import webdriver
144 |
145 | browser=webdriver.Chrome()
146 | browser.get('https://www.baidu.com')
147 | browser.execute_script('window.open()') #新开启一个选项卡
148 | print(browser.window_handles) #获取当前所有开启的选项卡的代号列表
149 | browser.switch_to.window(browser.window_handles[1]) #切换选项卡
150 | browser.get('https://www.taobao.com') #在新开启的选项卡里面打开一个网页
151 | time.sleep(1)
152 | browser.switch_to.window(browser.window_handles[0]) #切回到原来的选项卡
153 | browser.get('https://www.python.org') #执行操作
154 | browser.close()
--------------------------------------------------------------------------------
/python3网络爬虫实战练习/14-splash使用.py:
--------------------------------------------------------------------------------
1 | import requests
2 |
3 |
4 | #利用render.html获取经过JavaScript渲染之后的页面与源代码
5 | 'curl http://localhost:8050/render.html?url=https://www.baidu.com'
6 |
7 |
8 |
9 | #利用render.har获取页面加载的HAR数据,返回的是json数据
10 | 'curl http://localhost:8050/render.har?url=https://www.jd.com&wait=5'
11 |
12 |
13 |
14 | #获取京东首页渲染完成之后的页面截图,并将其保存在本地(调用了splash的API render.png)
15 | url='http://localhost:8050/render.png?url=https://www.jd.com&wait=5&width=1000&height=700'
16 | response=requests.get(url)
17 | with open('jd.png','wb') as f:
18 | f.write(response.content)
19 |
20 |
21 |
22 | #render.json接口包含前面几个接口的所有功能,返回json数据格式
23 | 'curl http://localhost:8050/render.json?url=https://httpbin.org'
24 |
25 |
26 |
27 | #此接口用于实现与Splash Lua脚本的对接(execute)
28 | #先实现一个Lua一个最简单的脚本
29 | # function main(splash)
30 | # return 'Hello'
31 | # end
32 | #然后将此脚本转化为URL编码的字符串,拼接到execute接口后面
33 | 'curl http://localhost:8050/execute?lua_source=function+main(.*?)end'
34 | #运行结果,输出Hello,利用python实现
35 | import requests
36 | from urllib.parse import quote
37 | lua='''
38 | function main(splash)
39 | return 'Hello'
40 | end
41 | '''
42 | url='http://localhost:8050/execute?lua_source='+quote(lua)
43 | response=requests.get(url)
44 | print(response.text)
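45 |
46 | # The render.html and render.har endpoints shown above as curl commands can be called from Python in the same
47 | # way as render.png; a brief sketch (assumes a Splash instance listening on localhost:8050):
48 | html_url='http://localhost:8050/render.html?url=https://www.baidu.com'
49 | print(requests.get(html_url).text[:200])     # beginning of the JavaScript-rendered page source
50 | har_url='http://localhost:8050/render.har?url=https://www.jd.com&wait=5'
51 | print(requests.get(har_url).json().keys())   # the HAR data comes back as JSON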
--------------------------------------------------------------------------------
/python3网络爬虫实战练习/15-使用selenium爬取淘宝商品.py:
--------------------------------------------------------------------------------
1 | '''
2 | 在之前,我们已经会分析Ajax请求来抓取相关数据,但是并不是所有页面都可以通过分析Ajax来完成抓取。比如,淘宝
3 | 它的页面的确也是通过Ajax获取的,但是这些Ajax比较复杂,可能会包含加密密钥,所以想通过自己构造Ajax参数,还
4 | 是比较困难的,最快速的方法还是selenium。本节中,我们就通过selenium来模拟浏览器操作,抓取淘宝商品信息,并
5 | 将其存储在Mongodb中。
6 | '''
7 | from selenium import webdriver
8 | from selenium.common.exceptions import TimeoutException
9 | from selenium.webdriver.common.by import By
10 | from selenium.webdriver.support import expected_conditions as EC
11 | from selenium.webdriver.support.wait import WebDriverWait
12 | from urllib.parse import quote
13 | from pyquery import PyQuery as pq
14 | from multiprocessing import Pool
15 | import pymongo
16 |
17 | chrome_options=webdriver.ChromeOptions()
18 | chrome_options.add_argument('--headless')
19 | browser=webdriver.Chrome(chrome_options=chrome_options) #设置chrome的无头模式(headless)
20 | # browser=webdriver.Chrome()
21 | wait=WebDriverWait(browser,10)
22 | KEYWORD='ipad'
23 | MONGO_URL='localhost'
24 | MONGO_DB='TAOBAO'
25 | MONGO_COLLECTION='products'
26 | client=pymongo.MongoClient(MONGO_URL)
27 | db=client[MONGO_DB]
28 |
29 |
30 | def index_page(page): #定义一个获取索引页信息的函数
31 | print('正在爬取第',page,'页')
32 | try:
33 | url='https://s.taobao.com/search?q='+quote(KEYWORD)
34 | browser.get(url)
35 | if page>1:
36 | input=wait.until(EC.presence_of_element_located((By.CSS_SELECTOR,'#mainsrp-pager > div > div > div > div.form > input')))
37 | submit=wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR,'#mainsrp-pager > div > div > div > div.form > span.btn.J_Submit')))
38 | input.clear()
39 | input.send_keys(page)
40 | submit.click()
41 | wait.until(EC.text_to_be_present_in_element((By.CSS_SELECTOR,'#mainsrp-pager > div > div > div > ul > li.item.active > span'),str(page)))
42 | wait.until(EC.presence_of_element_located((By.CSS_SELECTOR,'.m-itemlist .items .item')))
43 | get_products()
44 | except TimeoutException:
45 | index_page(page)
46 |
47 |
48 | def get_products(): #解析获取到的索引页的信息,将商品的信息从中提取出来
49 | html=browser.page_source
50 | doc=pq(html)
51 | items=doc('#mainsrp-itemlist .items .item').items()
52 | for item in items:
53 | product= {
54 | 'image': item.find('.pic .img').attr('data-src'),
55 | 'price': item.find('.price').text(),
56 | 'deal': item.find('.deal-cnt').text(),
57 | 'title': item.find('.title').text(),
58 | 'shop': item.find('.shop').text(),
59 | 'location': item.find('.location').text()
60 | }
61 | print(product)
62 | # save_to_mongo(product)
63 |
64 | def save_to_mongo(result): #定义一个存储到MONGODB数据库的方法
65 | try:
66 | if db[MONGO_COLLECTION].insert(result):
67 | print('存储到Mongodb成功!')
68 | except Exception:
69 | print('存储到Mongodb失败!')
70 |
71 | MAX_PAGE=100
72 | def main(): #实现页码的遍历
73 | for i in range(1,MAX_PAGE+1):
74 | index_page(i)
75 |
76 | if __name__=='__main__':
77 | main()
78 |
79 |
80 |
81 |
82 |
83 |
84 |
85 |
86 |
87 |
--------------------------------------------------------------------------------
/python3网络爬虫实战练习/16-中国知网验证码识别/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/coolcooljob/Python3WebSpider-Test/b1ff058286fcf10742d08e7c45a74e6f1369708a/python3网络爬虫实战练习/16-中国知网验证码识别/__init__.py
--------------------------------------------------------------------------------
/python3网络爬虫实战练习/16-中国知网验证码识别/captcha1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/coolcooljob/Python3WebSpider-Test/b1ff058286fcf10742d08e7c45a74e6f1369708a/python3网络爬虫实战练习/16-中国知网验证码识别/captcha1.png
--------------------------------------------------------------------------------
/python3网络爬虫实战练习/16-中国知网验证码识别/captcha2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/coolcooljob/Python3WebSpider-Test/b1ff058286fcf10742d08e7c45a74e6f1369708a/python3网络爬虫实战练习/16-中国知网验证码识别/captcha2.png
--------------------------------------------------------------------------------
/python3网络爬虫实战练习/16-中国知网验证码识别/code.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/coolcooljob/Python3WebSpider-Test/b1ff058286fcf10742d08e7c45a74e6f1369708a/python3网络爬虫实战练习/16-中国知网验证码识别/code.jpg
--------------------------------------------------------------------------------
/python3网络爬虫实战练习/16-中国知网验证码识别/极验验证码.py:
--------------------------------------------------------------------------------
1 | '''
2 | 极验验证码特点:首先点击按钮进行智能验证,如果验证不通过,则会弹出滑动验证的窗口,拖动滑块拼合图像进行验证,之后生成三个加密
3 | 参数,通过表单提交到后台,后台还会进行一次验证。
4 | 识别验证需要三步:
5 | 1.模拟点击验证按钮
6 | 2.识别滑动缺口的位置
7 | 3.模拟拖动滑块
8 | '''
9 | import time
10 | from io import BytesIO
11 | from PIL import Image
12 | from selenium import webdriver
13 | from selenium.webdriver import ActionChains
14 | from selenium.webdriver.common.by import By
15 | from selenium.webdriver.support.ui import WebDriverWait
16 | from selenium.webdriver.support import expected_conditions as EC
17 |
18 | EMAIL = 'coolcooljob@163.com'
19 | PASSWORD = 'zb748491'
20 | BORDER = 6
21 | INIT_LEFT = 60
22 |
23 |
24 | class CrackGeetest():
25 | def __init__(self):
26 | self.url = 'https://account.geetest.com/login'
27 | self.browser = webdriver.Chrome()
28 | self.wait = WebDriverWait(self.browser, 20)
29 | self.email = EMAIL
30 | self.password = PASSWORD
31 |
32 | def __del__(self):
33 | self.browser.close()
34 |
35 | def get_geetest_button(self):
36 | """
37 | 获取初始验证按钮
38 | :return:
39 | """
40 | button = self.wait.until(EC.element_to_be_clickable((By.CLASS_NAME, 'geetest_radar_tip')))
41 | return button
42 |
43 | def get_position(self):
44 | """
45 | 获取验证码位置
46 | :return: 验证码位置元组
47 | """
48 | img = self.wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'geetest_canvas_img')))
49 | time.sleep(2)
50 | location = img.location
51 | size = img.size
52 | top, bottom, left, right = location['y'], location['y'] + size['height'], location['x'], location['x'] + size[
53 | 'width']
54 | return (top, bottom, left, right)
55 |
56 | def get_screenshot(self):
57 | """
58 | 获取网页截图
59 | :return: 截图对象
60 | """
61 | screenshot = self.browser.get_screenshot_as_png()
62 | screenshot = Image.open(BytesIO(screenshot))
63 | return screenshot
64 |
65 | def get_slider(self):
66 | """
67 | 获取滑块
68 | :return: 滑块对象
69 | """
70 | slider = self.wait.until(EC.element_to_be_clickable((By.CLASS_NAME, 'geetest_slider_button')))
71 | return slider
72 |
73 | def get_geetest_image(self, name='captcha.png'):
74 | """
75 | 获取验证码图片
76 | :return: 图片对象
77 | """
78 | top, bottom, left, right = self.get_position()
79 | print('验证码位置', top, bottom, left, right)
80 | screenshot = self.get_screenshot()
81 | captcha = screenshot.crop((left, top, right, bottom))
82 | captcha.save(name)
83 | return captcha
84 |
85 | def open(self):
86 | """
87 | 打开网页输入用户名密码
88 | :return: None
89 | """
90 | self.browser.get(self.url)
91 | email = self.wait.until(EC.presence_of_element_located((By.ID, 'email')))
92 | password = self.wait.until(EC.presence_of_element_located((By.ID, 'password')))
93 | email.send_keys(self.email)
94 | password.send_keys(self.password)
95 |
96 | def get_gap(self, image1, image2):
97 | """
98 | 获取缺口偏移量
99 | :param image1: 不带缺口图片
100 | :param image2: 带缺口图片
101 | :return:
102 | """
103 | left = 60
104 | for i in range(left, image1.size[0]):
105 | for j in range(image1.size[1]):
106 | if not self.is_pixel_equal(image1, image2, i, j):
107 | left = i
108 | return left
109 | return left
110 |
111 | def is_pixel_equal(self, image1, image2, x, y):
112 | """
113 | 判断两个像素是否相同
114 | :param image1: 图片1
115 | :param image2: 图片2
116 | :param x: 位置x
117 | :param y: 位置y
118 | :return: 像素是否相同
119 | """
120 | # 取两个图片的像素点
121 | pixel1 = image1.load()[x, y]
122 | pixel2 = image2.load()[x, y]
123 | threshold = 60
124 | if abs(pixel1[0] - pixel2[0]) < threshold and abs(pixel1[1] - pixel2[1]) < threshold and abs(
125 | pixel1[2] - pixel2[2]) < threshold:
126 | return True
127 | else:
128 | return False
129 |
130 | def get_track(self, distance):
131 | """
132 | 根据偏移量获取移动轨迹
133 | :param distance: 偏移量
134 | :return: 移动轨迹
135 | """
136 | # 移动轨迹
137 | track = []
138 | # 当前位移
139 | current = 0
140 | # 减速阈值
141 | mid = distance * 4 / 5
142 | # 计算间隔
143 | t = 0.2
144 | # 初速度
145 | v = 0
146 |
147 | while current < distance:
148 | if current < mid:
149 | # 加速度为正2
150 | a = 2
151 | else:
152 | # 加速度为负3
153 | a = -3
154 | # 初速度v0
155 | v0 = v
156 | # 当前速度v = v0 + at
157 | v = v0 + a * t
158 | # 移动距离x = v0t + 1/2 * a * t^2
159 | move = v0 * t + 1 / 2 * a * t * t
160 | # 当前位移
161 | current += move
162 | # 加入轨迹
163 | track.append(round(move))
164 | return track
165 |
166 | def move_to_gap(self, slider, track):
167 | """
168 | 拖动滑块到缺口处
169 | :param slider: 滑块
170 | :param track: 轨迹
171 | :return:
172 | """
173 | ActionChains(self.browser).click_and_hold(slider).perform()
174 | for x in track:
175 | ActionChains(self.browser).move_by_offset(xoffset=x, yoffset=0).perform()
176 | time.sleep(0.5)
177 | ActionChains(self.browser).release().perform()
178 |
179 | def login(self):
180 | """
181 | 登录
182 | :return: None
183 | """
184 | submit = self.wait.until(EC.element_to_be_clickable((By.CLASS_NAME, 'login-btn')))
185 | submit.click()
186 | time.sleep(10)
187 | print('登录成功')
188 |
189 | def crack(self):
190 | # 输入用户名密码
191 | self.open()
192 | # 点击验证按钮
193 | button = self.get_geetest_button()
194 | button.click()
195 | # 获取验证码图片
196 | image1 = self.get_geetest_image('captcha1.png')
197 | # 点按呼出缺口
198 | slider = self.get_slider()
199 | slider.click()
200 | # 获取带缺口的验证码图片
201 | image2 = self.get_geetest_image('captcha2.png')
202 | # 获取缺口位置
203 | gap = self.get_gap(image1, image2)
204 | print('缺口位置', gap)
205 | # 减去缺口位移
206 | gap -= BORDER
207 | # 获取移动轨迹
208 | track = self.get_track(gap)
209 | print('滑动轨迹', track)
210 | # 拖动滑块
211 | self.move_to_gap(slider, track)
212 |
213 | success = self.wait.until(
214 | EC.text_to_be_present_in_element((By.CLASS_NAME, 'geetest_success_radar_tip_content'), '验证成功'))
215 | print(success)
216 |
217 | # 失败后重试
218 | if not success:
219 | self.crack()
220 | else:
221 | self.login()
222 |
223 |
224 | if __name__ == '__main__':
225 | crack = CrackGeetest()
226 | crack.crack()
227 |
--------------------------------------------------------------------------------
/python3网络爬虫实战练习/16-中国知网验证码识别/验证码识别.py:
--------------------------------------------------------------------------------
1 | import tesserocr
2 | from PIL import Image
3 |
4 |
5 | #当识别得到验证码结果和实际图片内容有差异时,可以试着将图片处理一下,比如转灰度值,二值化,此外,还可以指定二值化的阈值
6 |
7 | # image=Image.open('code.jpg')
8 | # result=tesserocr.image_to_text(image)
9 | # print(result)
10 |
11 | #直接将图片转化为文字
12 | # print(tesserocr.file_to_text('code.png'))
13 |
14 | #若图片内容转化有差异,可做如下处理
15 | # image=Image.open('code.jpg')
16 | # image=image.convert('L') #将图片转化为灰度图像
17 | # image.show()
18 | # image=image.convert('1')
19 | # image.show() #将图片进行二值化处理
20 |
21 | image=Image.open('code.jpg').convert('L') #先打开图片并转为灰度图
22 | threshold=80 #设置二值化阈值
23 | table=[]
24 | for i in range(256):
25 |     if i < threshold:
26 |         table.append(0)
27 |     else:
28 |         table.append(1)
29 | image=image.point(table,'1') #按照阈值表进行二值化处理
30 | print(tesserocr.image_to_text(image))
--------------------------------------------------------------------------------
/python3网络爬虫实战练习/19-TaoBaoMM/TaoBaoMM.py:
--------------------------------------------------------------------------------
29 |         domain = response.doc('.mm-p-domain-info li > span').text()
30 | if domain:
31 | page_url = 'https:' + domain
32 | self.crawl(page_url, callback=self.domain_page)
33 |
34 | def domain_page(self, response):
35 | name = response.doc('.mm-p-model-info-left-top dd > a').text()
36 | dir_path = self.deal.mkDir(name)
37 | brief = response.doc('.mm-aixiu-content').text()
38 | if dir_path:
39 | imgs = response.doc('.mm-aixiu-content img').items()
40 | count = 1
41 | self.deal.saveBrief(brief, dir_path, name)
42 | for img in imgs:
43 | url = img.attr.src
44 | if url:
45 | extension = self.deal.getExtension(url)
46 | file_name = name + str(count) + '.' + extension
47 | count += 1
48 | self.crawl(img.attr.src, callback=self.save_img,
49 | save={'dir_path': dir_path, 'file_name': file_name})
50 |
51 | def save_img(self, response):
52 | content = response.content
53 | dir_path = response.save['dir_path']
54 | file_name = response.save['file_name']
55 | file_path = dir_path + '/' + file_name
56 | self.deal.saveImg(content, file_path)
57 |
58 |
59 | import os
60 |
61 | class Deal:
62 | def __init__(self):
63 | self.path = DIR_PATH
64 | if not self.path.endswith('/'):
65 | self.path = self.path + '/'
66 | if not os.path.exists(self.path):
67 | os.makedirs(self.path)
68 |
69 | def mkDir(self, path):
70 | path = path.strip()
71 | dir_path = self.path + path
72 | exists = os.path.exists(dir_path)
73 | if not exists:
74 | os.makedirs(dir_path)
75 | return dir_path
76 | else:
77 | return dir_path
78 |
79 | def saveImg(self, content, path):
80 | f = open(path, 'wb')
81 | f.write(content)
82 | f.close()
83 |
84 | def saveBrief(self, content, dir_path, name):
85 | file_name = dir_path + "/" + name + ".txt"
86 |         f = open(file_name, "w+", encoding='utf-8')
87 |         f.write(content)  #Python3文本模式下直接写入字符串,无需再encode
88 |         f.close()
89 | def getExtension(self, url):
90 | extension = url.split('.')[-1]
91 | return extension
--------------------------------------------------------------------------------
/python3网络爬虫实战练习/19-TaoBaoMM/data/result.db:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/coolcooljob/Python3WebSpider-Test/b1ff058286fcf10742d08e7c45a74e6f1369708a/python3网络爬虫实战练习/19-TaoBaoMM/data/result.db
--------------------------------------------------------------------------------
/python3网络爬虫实战练习/20-代理的使用.py:
--------------------------------------------------------------------------------
1 | #urllib
2 | #输出结果是一个JSON,它有一个字段origin,表明了客户端的IP。
3 | from urllib.error import URLError
4 | from urllib.request import ProxyHandler,build_opener
5 |
6 | proxy=''
7 | # proxy='username:password@' #若碰到需要认证的代理,可以进行这样的设置。
8 | proxy_handler=ProxyHandler({
9 | 'http':'http://'+proxy,
10 | 'https':'https://'+proxy,
11 | })
12 | opener=build_opener(proxy_handler)
13 | try:
14 | response=opener.open('http://httpbin.org/get')
15 | print(response.read().decode('utf-8'))
16 | except URLError as e:
17 | print(e.reason)
18 |
19 | #requests
20 | import requests
21 |
22 | proxy=''
23 | # proxy='username:password@' #有用户验证的情况,可以进行这样的设置
24 | proxies={
25 | 'http':'http://'+proxy,
26 | 'https':'https://'+proxy,
27 | }
28 | try:
29 | response=requests.get('http://httpbin.org/get',proxies=proxies)
30 | print(response.text)
31 | except requests.exceptions.ConnectionError as e:
32 | print('Error',e.args)
33 |
34 |
35 | #selenium(有界面:Chrome,无界面:PhantomJS)
36 | #Chrome
37 | from selenium import webdriver
38 |
39 | proxy=''
40 | chrome_options=webdriver.ChromeOptions()
41 | chrome_options.add_argument('--proxy-server=http://'+proxy)
42 | browser=webdriver.Chrome(chrome_options=chrome_options)
43 | browser.get('http://httpbin.org/get')
44 |
45 | #PhantomJS
46 | from selenium import webdriver
47 |
48 | service_args=[
49 | '--proxy=',
50 | '--proxy-type=http'
51 | ]
52 | # service_args=[
53 | # '--proxy=127.0.0.1:9743',
54 | # '--proxy-type=http',
55 | # '--proxy-auth=username:password'
56 | # ] #需要进行认证设置的时候
57 | browser=webdriver.PhantomJS(service_args=service_args)
58 | browser.get('http://httpbin.org/get')
59 | print(browser.page_source)
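60 |
61 | # The comment at the top notes that httpbin.org/get returns JSON whose "origin" field is the client IP seen by
62 | # the server; a brief sketch for checking that requests really go through the proxy (assumes the proxies dict
63 | # above has been filled in with a working proxy address):
64 | try:
65 |     direct_ip=requests.get('http://httpbin.org/get').json()['origin']
66 |     proxied_ip=requests.get('http://httpbin.org/get',proxies=proxies).json()['origin']
67 |     print('direct origin:',direct_ip)
68 |     print('proxied origin:',proxied_ip)   # should differ from the direct origin when the proxy is in effect
69 | except requests.exceptions.RequestException as e:
70 |     print('Error',e.args)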
--------------------------------------------------------------------------------
/python3网络爬虫实战练习/3.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/coolcooljob/Python3WebSpider-Test/b1ff058286fcf10742d08e7c45a74e6f1369708a/python3网络爬虫实战练习/3.jpg
--------------------------------------------------------------------------------
/python3网络爬虫实战练习/data.csv:
--------------------------------------------------------------------------------
1 | id name age
2 | 1 job 20
3 | 2 jack 22
4 |
--------------------------------------------------------------------------------
/python3网络爬虫实战练习/data/project.db:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/coolcooljob/Python3WebSpider-Test/b1ff058286fcf10742d08e7c45a74e6f1369708a/python3网络爬虫实战练习/data/project.db
--------------------------------------------------------------------------------
/python3网络爬虫实战练习/data/result.db:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/coolcooljob/Python3WebSpider-Test/b1ff058286fcf10742d08e7c45a74e6f1369708a/python3网络爬虫实战练习/data/result.db
--------------------------------------------------------------------------------
/python3网络爬虫实战练习/data/scheduler.1d:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/coolcooljob/Python3WebSpider-Test/b1ff058286fcf10742d08e7c45a74e6f1369708a/python3网络爬虫实战练习/data/scheduler.1d
--------------------------------------------------------------------------------
/python3网络爬虫实战练习/data/scheduler.1h:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/coolcooljob/Python3WebSpider-Test/b1ff058286fcf10742d08e7c45a74e6f1369708a/python3网络爬虫实战练习/data/scheduler.1h
--------------------------------------------------------------------------------
/python3网络爬虫实战练习/data/scheduler.all:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/coolcooljob/Python3WebSpider-Test/b1ff058286fcf10742d08e7c45a74e6f1369708a/python3网络爬虫实战练习/data/scheduler.all
--------------------------------------------------------------------------------
/python3网络爬虫实战练习/data/task.db:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/coolcooljob/Python3WebSpider-Test/b1ff058286fcf10742d08e7c45a74e6f1369708a/python3网络爬虫实战练习/data/task.db
--------------------------------------------------------------------------------
/python3网络爬虫实战练习/data1.csv:
--------------------------------------------------------------------------------
1 | id,name,age
2 | 100,job,22
3 | 101,tom,32
4 | 102,mary,25
5 |
--------------------------------------------------------------------------------
/python3网络爬虫实战练习/ghostdriver.log:
--------------------------------------------------------------------------------
1 | [INFO - 2018-05-19T03:48:03.597Z] GhostDriver - Main - running on port 52052
2 | [INFO - 2018-05-19T03:48:08.043Z] Session [716db230-5b17-11e8-aebb-055f3c8bc10e] - page.settings - {"XSSAuditingEnabled":false,"javascriptCanCloseWindows":true,"javascriptCanOpenWindows":true,"javascriptEnabled":true,"loadImages":true,"localToRemoteUrlAccessEnabled":false,"userAgent":"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/538.1 (KHTML, like Gecko) PhantomJS/2.1.1 Safari/538.1","webSecurityEnabled":true}
3 | [INFO - 2018-05-19T03:48:08.043Z] Session [716db230-5b17-11e8-aebb-055f3c8bc10e] - page.customHeaders: - {}
4 | [INFO - 2018-05-19T03:48:08.043Z] Session [716db230-5b17-11e8-aebb-055f3c8bc10e] - Session.negotiatedCapabilities - {"browserName":"phantomjs","version":"2.1.1","driverName":"ghostdriver","driverVersion":"1.2.0","platform":"windows-10-32bit","javascriptEnabled":true,"takesScreenshot":true,"handlesAlerts":false,"databaseEnabled":false,"locationContextEnabled":false,"applicationCacheEnabled":false,"browserConnectionEnabled":false,"cssSelectorsEnabled":true,"webStorageEnabled":false,"rotatable":false,"acceptSslCerts":false,"nativeEvents":true,"proxy":{"proxyType":"direct"}}
5 | [INFO - 2018-05-19T03:48:08.044Z] SessionManagerReqHand - _postNewSessionCommand - New Session Created: 716db230-5b17-11e8-aebb-055f3c8bc10e
6 |
--------------------------------------------------------------------------------