├── README.md └── python3网络爬虫实战练习 ├── 01-flask测试.py ├── 02-tornado测试.py ├── 03-BeautifulSoup.py ├── 04-pyquery.py ├── 05-数据存储.py ├── 06-json数据格式.py ├── 07-csv文件.py ├── 08-关系型数据库存储.py ├── 09-redis连接.py ├── 10-利用python模拟Ajax请求爬取微博详情.py ├── 11-分析Ajax爬取今日头条街拍美图.py ├── 12-selenium的使用.py ├── 13-selenium使用2.py ├── 14-splash使用.py ├── 15-使用selenium爬取淘宝商品.py ├── 16-中国知网验证码识别 ├── __init__.py ├── captcha1.png ├── captcha2.png ├── code.jpg ├── 极验验证码.py └── 验证码识别.py ├── 16-爬取虎牙直播主播信息.py ├── 17-借助在线验证码识别平台完成点出验证码的识别 ├── Chaojiying.py └── 点触验证码的识别.py ├── 19-TaoBaoMM ├── TaoBaoMM.py └── data │ └── result.db ├── 20-代理的使用.py ├── 3.jpg ├── data.csv ├── data ├── project.db ├── result.db ├── scheduler.1d ├── scheduler.1h ├── scheduler.all └── task.db ├── data1.csv └── ghostdriver.log /README.md: -------------------------------------------------------------------------------- 1 | # Python3WebSpider-Test 2 | Python3网络爬虫实战练习 3 | -------------------------------------------------------------------------------- /python3网络爬虫实战练习/01-flask测试.py: -------------------------------------------------------------------------------- 1 | from flask import Flask 2 | 3 | app=Flask(__name__) 4 | 5 | @app.route("/") 6 | def hello(): 7 | return 'Hello World!' 8 | 9 | 10 | if __name__=='__main__': 11 | app.run() -------------------------------------------------------------------------------- /python3网络爬虫实战练习/02-tornado测试.py: -------------------------------------------------------------------------------- 1 | import tornado.ioloop 2 | import tornado.web 3 | 4 | class MainHandler(tornado.web.RequestHandler): 5 | def get(self): 6 | self.write('Hello World!') 7 | 8 | def make_app(): 9 | return tornado.web.Application([(r"/",MainHandler),]) 10 | 11 | if __name__=='__main__': 12 | app=make_app() 13 | app.listen(8888) 14 | tornado.ioloop.IOLoop.current().start() -------------------------------------------------------------------------------- /python3网络爬虫实战练习/03-BeautifulSoup.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup #利用BeautifulSoup将网页的代码按照标准的缩进打印出来 3 | import re 4 | 5 | # response=requests.get('http://www.baidu.com') 6 | # response.encoding='utf-8' 7 | # print(response.text) 8 | 9 | # 基本用法 10 | html=requests.get('https://www.baidu.com') 11 | soup=BeautifulSoup(html.content,'lxml') #注意这里的html.content,不能直接传入html 12 | print(soup.prettify()) 13 | print(soup.title.string) 14 | 15 | # 选择元素 16 | print(soup.title) 17 | print(soup.p) 18 | 19 | # 提取信息 20 | print(soup.title.name) # 获取名称 21 | print(soup.p.attrs) # 获取属性 22 | print(soup.p.attrs['name']) 23 | 24 | # 获取内容 25 | print(soup.p.string) 26 | 27 | # 嵌套选择 28 | print(soup.head.title) 29 | print(type(soup.head.title)) 30 | print(soup.head.title.string) 31 | 32 | # 关联选择 33 | # 子节点和子孙节点 34 | print(soup.p.contents) # 获取p节点的直接子节点(以列表形式返回),即子孙节点 35 | print(soup.p.children) # 返回结果是生成器类型,接下来用for循环 36 | for i,child in enumerate(soup.p.children): 37 | print(i,child) 38 | 39 | print(soup.p.descendants) # 获取子孙节点 40 | for i,child in enumerate(soup.p.descendants): 41 | print(i,child) 42 | 43 | # 父节点和祖先节点 44 | print(soup.a.parent) # 获取a节点的直接父节点 45 | print(soup.a.parents) # 获取a节点的祖先节点 46 | 47 | # 兄弟节点 48 | print(soup.a.next_sibling) 49 | print(soup.a.previous_sibling) 50 | 51 | # 提取信息 52 | print(soup.a.next_sibling.string) 53 | print(soup.a.previous_sibling.text) 54 | 55 | # 方法选择器 56 | # find_all(name,attrs,recursive,text,**kwargs) 57 | print(soup.find_all('ul')) 58 | for ul in soup.find_all(name='ul'): 59 | 
print(ul.find_all(name='li')) 60 | 61 | print(soup.find_all(attrs={'name':'list-1'})) # 返回所有匹配的元素组成的列表 62 | print(soup.find_all(id='list-1')) 63 | 64 | print(soup.find_all(text=re.compile('link'))) 65 | 66 | print(soup.find(name='ul')) # 返回第一个匹配的元素 67 | 68 | # CSS选择器 69 | # print(soup.select('ul li')) 70 | # print(soup.select(#list-2 .element)) 71 | # print(soup.select('.panel .panel-heading')) 72 | 73 | # 获取属性 74 | for ul in soup.select('ul'): 75 | print(ul['id']) 76 | print(ul.attrs['id']) 77 | 78 | # 获取文本 79 | for li in soup.select('li'): 80 | print(li.get_text()) 81 | print(li.string) 82 | print(li.string) -------------------------------------------------------------------------------- /python3网络爬虫实战练习/04-pyquery.py: -------------------------------------------------------------------------------- 1 | from pyquery import PyQuery as pq 2 | import requests 3 | 4 | # URL初始化 5 | doc=pq(url='https://www.cuiqingcai.com') #请求url,得到结果 6 | doc=pq(requests.get('https://www.cuiqingcai.com').text) #等价于上面那句 7 | print(doc('title')) 8 | 9 | # 文件初始化 10 | doc=pq(filename=demo.html) 11 | print(doc('li')) 12 | 13 | # 基本CSS选择器 14 | html=''' 15 | 16 | ''' 17 | doc=pq(html) 18 | print(doc('#container .list li')) 19 | print(type(doc('#container .list li'))) 20 | 21 | # 查找节点 22 | # 子节点 23 | html=''' 24 | 25 | ''' 26 | doc=pq(html) 27 | items=doc('.list') 28 | print(type(items)) 29 | print(items) 30 | lis=items.find('li') # 查找所有的子孙节点 31 | # lis=items.children() # 查找所有的子节点 32 | # lis=items.children('.active') # 查找字节点中class为active的节点 33 | print(type(lis)) 34 | print(lis) 35 | 36 | 37 | # 父节点 38 | doc=pq(html) 39 | items=doc('.list') 40 | container=items.parent() # 查找父节点 41 | # container=items.parents() # 查找祖先节点 42 | print(type(container)) 43 | print(container) 44 | 45 | # # 兄弟节点 46 | doc=pq(html) 47 | li=doc('.list .item-0.active') # 选取class为list的节点内部class为item-0和active的节点 48 | print(li.siblings()) # 查找兄弟节点 49 | 50 | 51 | # 遍历(多个节点),调用items方法 52 | doc=pq(html) 53 | lis=doc('li').items() 54 | print(type(lis)) 55 | for li in lis: 56 | print(li,type(li)) 57 | 58 | # 获取信息 59 | # 获取属性 60 | doc=pq(html) 61 | a=doc('.item-0.active') 62 | print(a,type(a)) 63 | print(a.attr('href')) # 与下面等价 64 | # print(a.attr.hrer()) 65 | 66 | # 获取多个属性,需要遍历才能实现 67 | doc=pq(html) 68 | a=doc('a') 69 | for i in a.items(): 70 | print(a.attr('href')) 71 | 72 | # 获取文本(调用text()方法实现) 73 | doc=pq(html) 74 | a=doc('.item-0.active a') 75 | print(a) 76 | print(a.text()) # 去掉节点的包括的所有HTML,只返回纯文字内容 77 | 78 | # 获取HTML内容 79 | doc=pq(html) 80 | li=doc('.item-0.active') 81 | print(li) 82 | print(li.html) 83 | 84 | # 如果选取的是多个节点,text()或html()返回的是第一个li节点的内部的HTML文本,而text()返回的是所有的li节点内部的纯文本 85 | 86 | # 节点操作,比如为某个节点移除或者增添一个class 87 | doc=pq(html) 88 | li=doc('.item-0.active') 89 | print(li) 90 | li.remove_class('active') 91 | print(li) 92 | li.add_class('active') 93 | 94 | # attr,text和html(改变属性,文本,以及html) 95 | doc=pq(html) 96 | li=doc('.item-0.active') 97 | li.attr('name','link') 98 | print(li) 99 | li.text('changed item') 100 | print(li) 101 | li.html('changed item') 102 | print(li) 103 | 104 | 105 | # remove() 106 | # wrap.find('p').remove() 107 | # print(wrip.text()) 108 | 109 | # 伪类选择器 110 | 111 | 112 | 113 | -------------------------------------------------------------------------------- /python3网络爬虫实战练习/05-数据存储.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from pyquery import PyQuery as pq 3 | 4 | url='https://www.zhihu.com/' 5 | headers={ 6 | 'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) 
AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.170 Safari/537.36' 7 | } 8 | html=requests.get(url=url,headers=headers).text 9 | doc=pq(html) 10 | # print(doc) 11 | items=doc('.explore-tab .feed-item').items() 12 | print(items) 13 | for item in items: 14 | question=item.find('h2').text() 15 | print('1') 16 | author=item.find('.author-link-line').text() 17 | answer=pq(item.find('.content').html()).text() 18 | with open('explore.txt','a') as f: 19 | f.write('\n'.join([question,author,answer])) 20 | f.write('\n'+'='*50+'\n') 21 | f.close() 22 | 23 | 24 | # response2 = requests.get('https://p4.ssl.cdn.btime.com/t01f7081c44b722510b.jpg') 25 | # 26 | # with open('3.jpg','wb') as f: 27 | # f.write(response2.content) 28 | # f.close() 29 | -------------------------------------------------------------------------------- /python3网络爬虫实战练习/06-json数据格式.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | #json.loads()可以将文本字符串转化为json对象,可以是列表,也可以使是符串 4 | #json.dumps()可以将json对象转化为文本字符串 5 | #json的数据需要用双引号来包围,不能使用单引号,否则loads()方法会解析失败 6 | 7 | str='''[{ 8 | "name":"job", #注意数据必须是以双引号包围,不然json.loads()则会出现解析错误 9 | "age":"20", 10 | "gender":"boy" 11 | },{ 12 | "name":"jack", 13 | "age":"22", 14 | "gender":"boy" 15 | }]''' 16 | print(type(str)) 17 | data=json.loads(str) 18 | print(type(data)) 19 | print(data) 20 | -------------------------------------------------------------------------------- /python3网络爬虫实战练习/07-csv文件.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import pandas as pd #利用pandas读取csv数据文件 3 | 4 | #csv,其文件以纯文本的形式存储表格数据,相当于一个结构化表的纯文本形式 5 | #写入 6 | with open('data.csv','w') as csvfile: #打开csv文件,获得文件句柄 7 | writer=csv.writer(csvfile) #初始化传入对象,传入该句柄 8 | writer=csv.writer(csvfile,delimiter=' ') #可以修改数据之间的分隔符,默认是逗号 9 | writer.writerow(['id','name','age']) #以writerow()传入每行的数据 10 | writer.writerow(['1','job','20']) 11 | writer.writerow(['2','jack','22']) 12 | 13 | 14 | #写入字典格式的数据 15 | with open('data1.csv','w',encoding='utf-8') as csvfile: 16 | fieldnames=['id','name','age'] #先定义三个字段,用filenames表示 17 | writer=csv.DictWriter(csvfile,fieldnames=fieldnames) #将字段传入给Dictwriter来初始化一个字典写入对象 18 | writer.writeheader() #写入头信息 19 | writer.writerow({'id':'100','name':'job','age':22}) #传入相应字段 20 | writer.writerow({'id':'101','name':'tom','age':32}) 21 | writer.writerow({'id':'102','name':'mary','age':25}) 22 | 23 | #读取 24 | with open('data1.csv','r',encoding='utf-8') as csvfile: 25 | reader=csv.reader(csvfile) 26 | for row in reader: 27 | print(row) 28 | 29 | 30 | 31 | df=pd.read_csv('data1.csv') 32 | print(df) 33 | -------------------------------------------------------------------------------- /python3网络爬虫实战练习/08-关系型数据库存储.py: -------------------------------------------------------------------------------- 1 | #连接数据库 2 | import pymysql 3 | 4 | # db=pymysql.connect(host='localhost',user='root',password='748491',port=3306) 5 | # cursor=db.cursor() 6 | # cursor.execute('select version()') 7 | # data=cursor.fetchone() 8 | # print('database version:',data) 9 | # cursor.execute('create database spiders default character set utf8') 10 | # db.close() 11 | 12 | #创建表 13 | id='101' 14 | user='job' 15 | age='22' 16 | db=pymysql.connect(host='localhost',user='root',password='748491',port=3306,db='spiders') 17 | #sql='create table if not exists students (id varchar(255) not null,name varchar(255) not null,age int not null ,primary key(id))' 18 | 
#注意传入数据的时候尽量使用这种格式化符的形式,有几个value写几个%s,只需要在execute()方法的第一个参数传入该sql语句,value值统一传一个元组就好了 19 | sql='insert into students(id,name,age) values(%s,%s,%s)' 20 | cursor=db.cursor() 21 | # cursor.execute(sql1) 22 | try: 23 | cursor.execute(sql,(id,user,age)) 24 | db.commit() #必须执行commit()方法才能将数据插入数据库中 25 | except: 26 | db.rollback() #异常处理,如果插入数据失败,就执行一次回滚,相当于什么都没有发生过 27 | db.close() 28 | -------------------------------------------------------------------------------- /python3网络爬虫实战练习/09-redis连接.py: -------------------------------------------------------------------------------- 1 | from redis import StrictRedis,ConnectionPool 2 | 3 | 4 | redis=StrictRedis(host='localhost',port=6379,db=0,password=None) #两种连接方式都可以 5 | # pool=ConnectionPool(host='localhost',port=6379,db=0,password=None) 6 | # redis=StrictRedis(connection_pool=pool) 7 | redis.set('name','job') 8 | redis.set('age',22) 9 | print(redis.get('name')) 10 | print(redis.exists('name')) 11 | redis.delete('age') 12 | print(type('name')) 13 | print(redis.keys('n*')) 14 | print(redis.randomkey()) 15 | redis.move('name',2) 16 | # redis.flushdb() #删除当前选择数据库中的所有的键 17 | # redis.flushall() #删除所有数据库中的所有的键 18 | print(redis.dbsize()) #获取当前数据库中的所有键的数目 19 | -------------------------------------------------------------------------------- /python3网络爬虫实战练习/10-利用python模拟Ajax请求爬取微博详情.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from urllib.parse import urlencode 3 | from pyquery import PyQuery as pq 4 | from pymongo import MongoClient 5 | 6 | client=MongoClient() #连接moongodb数据库,定义一些需要用到的变量 7 | db=client['weibo'] 8 | collection=db['weibo'] 9 | 10 | base_url='' 11 | headers={ 12 | 'Host: m.weibo.cn', 13 | 'Referer: https://m.weibo.cn/', 14 | 'User-Agent: Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.170 Safari/537.36', 15 | 'X-Requested-With: XMLHttpRequest' 16 | } 17 | 18 | def get_page(page): #定义一个获取ajax请求内容的函数,结果返回json数据格式 19 | params={ 20 | 'type':'uid', 21 | 'value':'', 22 | 'contained':'', 23 | 'page':page 24 | } 25 | url=base_url+urlencode(params) 26 | try: 27 | response=requests.get(url,headers=headers) 28 | if response.status_code==200: 29 | return response.json() 30 | except requests.ConnectionError as e: 31 | print('Error',e.args) 32 | 33 | def parse_page(json): #解析返回的json数据,并返回一个字典,包括微博的id,正文,赞数,评论数以及转发数 34 | if json: 35 | items=json.get('data').get('cards') 36 | for item in items: 37 | item=item.get('mblog') 38 | weibo={} 39 | weibo['id']=item.get() 40 | weibo['text']=pq(item.get('text')).text() #借助pquery将正文中的html去掉 41 | weibo['attitudes']=item.get('attitudes_count') 42 | weibo['comments']=item.get('comments_count') 43 | weibo['reposts']=item.get('reposts_count') 44 | yield weibo 45 | 46 | def save_to_mongo(result): #定义一个将数据存到mongodb数据库的方法 47 | if collection.insert(result): 48 | print('Save to Mongo!') 49 | 50 | if __name__=='__main__': 51 | for page in range(1,11): #遍历一下page,一共10页,将提取到的结果打印出来 52 | json=get_page(page) 53 | results=parse_page(json) 54 | for result in results: 55 | print(result) 56 | save_to_mongo(result) -------------------------------------------------------------------------------- /python3网络爬虫实战练习/11-分析Ajax爬取今日头条街拍美图.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from urllib.parse import urlencode 3 | import os 4 | from hashlib import md5 5 | from multiprocessing.pool import Pool 6 | from redis import StrictRedis 7 | 8 | 9 | 
redis=StrictRedis(host='localhost',port=6379,db=0,password=None) 10 | 11 | def get_one_page(offset): #实现方法来加载单个Ajax请求的结果,返回json的字符串格式 12 | params={ 13 | 'offset':offset, 14 | 'format':'json', 15 | 'keyword':'街拍', 16 | 'autoload':'true', 17 | 'count':'20', 18 | 'cur_tab':'1', 19 | } 20 | url='https://www.toutiao.com/search_content/?'+urlencode(params) 21 | try: 22 | response=requests.get(url) 23 | if response.status_code==200: 24 | return response.json() 25 | except requests.ConnectionError as e: 26 | return None 27 | 28 | def get_images(json): #实现一个解析方法,提取每条数据的image_list字段中的每一张图片的链接,将图片链接和图片所属的标题一并返回,可以构造一个生成器 29 | if json.get('data'): 30 | for item in json.get('data'): 31 | title=item.get('title') 32 | images=item.get('image_list') 33 | if images: 34 | for image in images: 35 | yield{ 36 | 'image':'http:'+image.get('url'), 37 | 'title':title, 38 | } 39 | else: 40 | return None 41 | #定义一个保存图片的方法,首先根据item的title来创建文件夹,然后请求这个图片链接,获取图片的二进制数据,以二进制的形式写 42 | #入文件。图片的名称可以使用其内容的MD5值,这样可以去除重复。 43 | def save_image(item): 44 | if not os.path.exists(item.get('title')): 45 | os.mkdir(item.get('title')) 46 | try: 47 | response=requests.get(item.get('image')) 48 | if response.status_code==200: 49 | file_path='{0}/{1}{2}'.format(item.get('title'),md5(response.content).hexdigest(),'.jpg') 50 | if not os.path.exists(file_path): 51 | with open(file_path,'wb') as f: 52 | f.write(response.content) 53 | else: 54 | print('Already Downloaded',file_path) 55 | # save_to_redis(f) 56 | except requests.ConnectionError: 57 | print('Failed to Save Image!') 58 | except requests.exceptions.MissingSchema as rem: 59 | print(rem) 60 | 61 | # def save_to_redis(f): #定义一个将数据存到mongodb数据库的方法 62 | # if redis.insert(f): 63 | # print('Save to Redid!') 64 | 65 | 66 | def main(offset): 67 | json=get_one_page(offset) 68 | for item in get_images(json): 69 | print(item) 70 | save_image(item) 71 | 72 | 73 | GROUP_START=1 74 | GROUP_END=20 75 | if __name__=='__main__': 76 | pool=Pool() 77 | groups=([x*20 for x in range(GROUP_START,GROUP_END+1)]) 78 | pool.map(main,groups) #利用多线程的线程池,调用其map方法实现多线程下载 79 | pool.close() 80 | pool.join() 81 | 82 | -------------------------------------------------------------------------------- /python3网络爬虫实战练习/12-selenium的使用.py: -------------------------------------------------------------------------------- 1 | from selenium import webdriver 2 | from selenium.webdriver.common.by import By 3 | from selenium.webdriver.common.keys import Keys 4 | from selenium.webdriver.support import expected_conditions as EC 5 | from selenium.webdriver.support.wait import WebDriverWait 6 | 7 | 8 | #通过selenium来驱动浏览器加载网页的话,可以直接拿到JavaScript渲染的结果了,不用担心使用的是什么加密结果 9 | browser=webdriver.Chrome() 10 | try: 11 | browser.get('https://www.baidu.com') 12 | # input_=browser.find_elements_by_id('kw') 13 | # input_.send_keys('Python') 14 | # input_.send_keys(Keys.ENTER) 15 | # wait=WebDriverWait(browser,10) 16 | # wait.until(EC.presence_of_element_located((By.ID,'content-left'))) 17 | print(browser.current_url) 18 | print(browser.get_cookies()) 19 | print(browser.page_source) 20 | finally: 21 | browser.close() 22 | 23 | 24 | -------------------------------------------------------------------------------- /python3网络爬虫实战练习/13-selenium使用2.py: -------------------------------------------------------------------------------- 1 | from selenium import webdriver 2 | import time 3 | from selenium.webdriver import ActionChains #动作链 4 | 5 | #查找节点并访问页面 6 | # browser=webdriver.Chrome() 7 | # browser.get('http://www.taobao.com') 8 | # 
lis=browser.find_elements_by_css_selector('body > div.screen-outer.clearfix > div.main > div.main-inner.clearfix > div.tbh-service.J_Module > div > ul > li:nth-child(3) > a:nth-child(1)') 9 | # print(lis) 10 | # browser.close() 11 | 12 | #节点交互 13 | # browser=webdriver.Chrome() 14 | # browser.get('https://www.taobao.com') 15 | # input=browser.find_element_by_id('q') 16 | # input.send_keys('iphone') 17 | # time.sleep(3) 18 | # input.clear() 19 | # input.send_keys('iPad') 20 | # button=browser.find_element_by_class_name('btn-search') 21 | # button.click() 22 | # time.sleep(3) 23 | # browser.close() 24 | 25 | #动作链,实现一个节点的拖拽 26 | # browser=webdriver.Chrome() 27 | # url='' 28 | # browser.get(url) 29 | # browser.switch_to.frame('') 30 | # source=browser.find_element_by_css_selector('#draggeable') 31 | # target=browser.find_element_by_css_selector('#droppable') 32 | # actions=ActionChains(browser) 33 | # actions.drag_and_drop(source,target) 34 | # actions.perform() 35 | 36 | #执行JavaScript(比如下拉进度条) 37 | # from selenium import webdriver 38 | # 39 | # browser=webdriver.Chrome() 40 | # browser.get('http://www.zhihu.com/explore') 41 | # browser.execute_script('window.scrollTo(0,document.body.scrollHeight)') #执行下拉进度条的脚本 42 | # browser.execute_script('alert("To Bottom")') #下拉到底部之后弹出一段提示 43 | # browser.close() 44 | 45 | #获取节点信息 46 | 47 | #获取属性(前提是先选中这个节点) 48 | from selenium import webdriver 49 | from selenium.webdriver import ActionChains 50 | 51 | # browser=webdriver.Chrome() 52 | # browser.get('http://www.zhihu.com/explore') 53 | # logo=browser.find_element_by_id('zh-top-link-logo') #选中这个logo 54 | # print(logo) 55 | # print(logo.get_attribute('class')) 56 | # browser.close() 57 | 58 | #获取文本值 59 | # from selenium import webdriver 60 | # 61 | # browser=webdriver.PhantomJS() 62 | # browser.get('http://www.zhihu.com/explore') 63 | # input=browser.find_element_by_class_name('zu-top-add-question') #获取提问这个按钮,然后将此按钮的文本内容打印出来 64 | # print(input.text) 65 | # 66 | # #获取ID,标签名 67 | # print(input.id) 68 | # print(input.location) 69 | # print(input.tag_name) 70 | # print(input.size) 71 | 72 | #切换frame 73 | # import time 74 | # from selenium import webdriver 75 | # from selenium.common.exceptions import NoSuchElementException 76 | # 77 | # browser=webdriver.Chrome() 78 | # url='http://www.runoob.com/try/try.php?filenanme=jqueryui-api-droppable' 79 | # browser.get(url) 80 | # browser.switch_to.frame('iframeResult') #切换至子级Frame里面,然后尝试获取父级里面的logo,找不到的话,抛出异常 81 | # try: 82 | # logo=browser.find_element_by_class_name('logo') 83 | # except NoSuchElementException: 84 | # print('No Logo!') 85 | # browser.switch_to.parent_frame() #切换至父级Frame里面,再次获取节点 86 | # logo=browser.find_element_by_class_name('logo') 87 | # print(logo) 88 | # print(logo.text) 89 | # browser.close() 90 | 91 | #延时等待(显式等待,隐式等待) 92 | #隐式等待,超出等待时间后,将抛出超时异常 93 | # from selenium import webdriver 94 | # browser=webdriver.Chrome() 95 | # browser.implicitly_wait(10) 96 | # browser.get('http://www.zhihu.com/explore') 97 | # input=browser.find_element_by_class_name('zu-top-add-question') 98 | # print(input) 99 | # browser.close() 100 | 101 | #显式等待,指定要查找的节点,然后之指定一个最长等待时间,如果规定时间加载出来,就返回查找的节点,否则就抛出异常 102 | # from selenium import webdriver 103 | # from selenium.webdriver.common.by import By 104 | # from selenium.webdriver.support.ui import WebDriverWait 105 | # from selenium.webdriver.support import expected_conditions as EC 106 | # 107 | # browser=webdriver.Chrome() 108 | # browser.get('http://www.taobao.com') 109 | # wait=WebDriverWait(browser,10) 110 | # 
input=wait.until(EC.presence_of_element_located((By.ID,'q'))) #传入条件,表示等待节点出现,下面的表示节点按钮可点击 111 | # button=wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR,'.btn-search'))) 112 | # print(input,button) 113 | # browser.close() 114 | 115 | #前进和后退(back(),forward()) 116 | # import time 117 | # from selenium import webdriver 118 | # 119 | # browser=webdriver.Chrome() 120 | # browser.get('http://www.baidu.com/') 121 | # browser.get('http://www.taobao.com/') 122 | # # browser.get('http://www.python.org/') 123 | # browser.back() 124 | # time.sleep(1) 125 | # browser.forward() 126 | # browser.close() 127 | 128 | 129 | #cookies 130 | # from selenium import webdriver 131 | # 132 | # browser=webdriver.Chrome() 133 | # browser.get('http://www.zhihu.com/explore') 134 | # print(browser.get_cookies()) 135 | # browser.add_cookie({'name':'name','domain':'www.zhihu.com','value':'job'}) 136 | # print(browser.get_cookies()) 137 | # browser.delete_all_cookies() 138 | # print(browser.get_cookies()) 139 | # browser.close() 140 | 141 | #选项卡操作 142 | import time 143 | from selenium import webdriver 144 | 145 | browser=webdriver.Chrome() 146 | browser.get('https://www.baidu.com') 147 | browser.execute_script('window.open()') #新开启一个选项卡 148 | print(browser.window_handles) #获取当前所有开启的选项卡的代号列表 149 | browser.switch_to.window(browser.window_handles[1]) #切换选项卡 150 | browser.get('https://www.taobao.com') #在新开启的选项卡里面打开一个网页 151 | time.sleep(1) 152 | browser.switch_to.window(browser.window_handles[0]) #切回到原来的选项卡 153 | browser.get('https://www.python.org') #执行操作 154 | browser.close() -------------------------------------------------------------------------------- /python3网络爬虫实战练习/14-splash使用.py: -------------------------------------------------------------------------------- 1 | import requests 2 | 3 | 4 | #利用render.html获取经过JavaScript渲染之后的页面与源代码 5 | 'curl http://localhost:8050/render.html?url=https://www.baidu.com' 6 | 7 | 8 | 9 | #利用render.har获取页面加载的HAR数据,返回的是json数据 10 | 'curl http://localhost:8050/render.har?url=https://www.jd.com&wait=5' 11 | 12 | 13 | 14 | #获取京东首页渲染完成之后的页面截图,并将其保存在本地(调用了splash的API render.png) 15 | url='http://localhost:8050/render.png?url=https://www,jd.com&wait=5&width=1000&height=700' 16 | response=requests.get(url) 17 | with open('taobao.png','wb') as f: 18 | f.write(response.content) 19 | 20 | 21 | 22 | #此接口包含前面借口的所有功能,返回json数据格式render.json 23 | 'curl http://localhost:8050/render.json?url=https://httpbin.org' 24 | 25 | 26 | 27 | #此接口用于实现与Splash Lua脚本的对接(execute) 28 | #先实现一个Lua一个最简单的脚本 29 | # function main(splash) 30 | # return 'Hello' 31 | # end 32 | #然后将此脚本转化为URL编码的字符串,拼接到execute接口后面 33 | 'curl http://localhost:8050/execute?lua_source=function+main(.*?)end' 34 | #运行结果,输出Hello,利用python实现 35 | import requests 36 | from urllib.parse import quote 37 | lua=''' 38 | function main(splash) 39 | return 'Hello' 40 | end 41 | ''' 42 | url='http://localhost:8050/execucte?lua_source='+quote(lua) 43 | response=requests.get(url) 44 | print(response.text) -------------------------------------------------------------------------------- /python3网络爬虫实战练习/15-使用selenium爬取淘宝商品.py: -------------------------------------------------------------------------------- 1 | ''' 2 | 在之前,我们已经会分析Ajax请求来抓取相关数据,但是并不是所有页面都可以通过分析Ajax来完成抓取。比如,淘宝 3 | 它的页面的确也是通过Ajax获取的,但是这些Ajax比较复杂,可能会包含加密密钥,所以想通过自己构造Ajax参数,还 4 | 时比较困难的,最快速的方法还是selenium。本节中,我们就通过selenium来模拟浏览器操作,抓取淘宝商品信息,并 5 | 将其存储在Mongodb中。 6 | ''' 7 | from selenium import webdriver 8 | from selenium.common.exceptions import TimeoutException 9 | from selenium.webdriver.common.by import By 10 
| from selenium.webdriver.support import expected_conditions as EC 11 | from selenium.webdriver.support.wait import WebDriverWait 12 | from urllib.parse import quote 13 | from pyquery import PyQuery as pq 14 | from multiprocessing import Pool 15 | import pymongo 16 | 17 | chrome_options=webdriver.ChromeOptions() 18 | chrome_options.add_argument('--headless') 19 | browser=webdriver.Chrome(chrome_options=chrome_options) #设置chrome的无头模式(headless) 20 | # browser=webdriver.Chrome() 21 | wait=WebDriverWait(browser,10) 22 | KEYWORD='ipad' 23 | MONGO_URL='localhost' 24 | MONGO_DB='TAOBAO' 25 | MONGO_COLLECTION='products' 26 | client=pymongo.MongoClient(MONGO_URL) 27 | db=client[MONGO_DB] 28 | 29 | 30 | def index_page(page): #定义一个获取索引页信息的函数 31 | print('正在爬取第',page,'页') 32 | try: 33 | url='https://s.taobao.com/search?q='+quote(KEYWORD) 34 | browser.get(url) 35 | if page>1: 36 | input=wait.until(EC.presence_of_element_located((By.CSS_SELECTOR,'#mainsrp-pager > div > div > div > div.form > input'))) 37 | submit=wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR,'#mainsrp-pager > div > div > div > div.form > span.btn.J_Submit'))) 38 | input.clear() 39 | input.send_keys(page) 40 | submit.click() 41 | wait.until(EC.text_to_be_present_in_element((By.CSS_SELECTOR,'#mainsrp-pager > div > div > div > ul > li.item.active > span'),str(page))) 42 | wait.until(EC.presence_of_element_located((By.CSS_SELECTOR,'.m-itemlist .items .item'))) 43 | get_products() 44 | except TimeoutException: 45 | index_page(page) 46 | 47 | 48 | def get_products(): #解析获取到的索引页的信息,将商品的信息从中提取出来 49 | html=browser.page_source 50 | doc=pq(html) 51 | items=doc('#mainsrp-itemlist .items .item').items() 52 | for item in items: 53 | product= { 54 | 'image': item.find('.pic .img').attr('data-src'), 55 | 'price': item.find('.price').text(), 56 | 'deal': item.find('.deal-cnt').text(), 57 | 'title': item.find('.title').text(), 58 | 'shop': item.find('.shop').text(), 59 | 'location': item.find('.location').text() 60 | } 61 | print(product) 62 | # save_to_mongo(product) 63 | 64 | def save_to_mongo(result): #定义一个存储到MONGODB数据库的方法 65 | try: 66 | if db[MONGO_COLLECTION].insert(result): 67 | print('存储到Mongodb成功!') 68 | except Exception: 69 | print('存储到Mongodb失败!') 70 | 71 | MAX_PAGE=100 72 | def main(): #实现页码的遍历 73 | for i in range(1,MAX_PAGE+1): 74 | index_page(i) 75 | 76 | if __name__=='__main__': 77 | main() 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | -------------------------------------------------------------------------------- /python3网络爬虫实战练习/16-中国知网验证码识别/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coolcooljob/Python3WebSpider-Test/b1ff058286fcf10742d08e7c45a74e6f1369708a/python3网络爬虫实战练习/16-中国知网验证码识别/__init__.py -------------------------------------------------------------------------------- /python3网络爬虫实战练习/16-中国知网验证码识别/captcha1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coolcooljob/Python3WebSpider-Test/b1ff058286fcf10742d08e7c45a74e6f1369708a/python3网络爬虫实战练习/16-中国知网验证码识别/captcha1.png -------------------------------------------------------------------------------- /python3网络爬虫实战练习/16-中国知网验证码识别/captcha2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coolcooljob/Python3WebSpider-Test/b1ff058286fcf10742d08e7c45a74e6f1369708a/python3网络爬虫实战练习/16-中国知网验证码识别/captcha2.png 
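Note: captcha1.png and captcha2.png in this folder appear to be the screenshots written by 极验验证码.py below — get_geetest_image('captcha1.png') saves the full captcha image and get_geetest_image('captcha2.png') the one with the gap revealed. The following is a minimal offline sketch (not part of the original project) for re-running the gap-detection step against these two saved images; the helper names, the start column 60 and the per-channel threshold 60 simply mirror the values used in CrackGeetest.get_gap() and is_pixel_equal(), and the two PNGs are assumed to be in the current working directory.

from PIL import Image

def is_pixel_equal(img1, img2, x, y, threshold=60):
    # treat two pixels as equal when every RGB channel differs by less than the threshold
    p1, p2 = img1.load()[x, y], img2.load()[x, y]
    return all(abs(p1[k] - p2[k]) < threshold for k in range(3))

def get_gap(img1, img2, start=60):
    # scan columns left to right and return the first x where the two screenshots differ
    for i in range(start, img1.size[0]):
        for j in range(img1.size[1]):
            if not is_pixel_equal(img1, img2, i, j):
                return i
    return start

if __name__ == '__main__':
    image1 = Image.open('captcha1.png')   # full captcha screenshot
    image2 = Image.open('captcha2.png')   # screenshot with the gap revealed
    print('gap left edge x =', get_gap(image1, image2))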
-------------------------------------------------------------------------------- /python3网络爬虫实战练习/16-中国知网验证码识别/code.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coolcooljob/Python3WebSpider-Test/b1ff058286fcf10742d08e7c45a74e6f1369708a/python3网络爬虫实战练习/16-中国知网验证码识别/code.jpg -------------------------------------------------------------------------------- /python3网络爬虫实战练习/16-中国知网验证码识别/极验验证码.py: -------------------------------------------------------------------------------- 1 | ''' 2 | 极验验证码特点:首先点击按钮进行智能验证,如果验证不通过,则会弹出滑动验证的窗口,拖动滑块拼合图像进行验证,之后生成三个加密 3 | 参数,通过表单提交到后台,后台还会进行一次验证。 4 | 识别验证需要三步: 5 | 1.模拟点击验证按钮 6 | 2.识别滑动缺口的位置 7 | 3.模拟拖动滑块 8 | ''' 9 | import time 10 | from io import BytesIO 11 | from PIL import Image 12 | from selenium import webdriver 13 | from selenium.webdriver import ActionChains 14 | from selenium.webdriver.common.by import By 15 | from selenium.webdriver.support.ui import WebDriverWait 16 | from selenium.webdriver.support import expected_conditions as EC 17 | 18 | EMAIL = 'coolcooljob@163.com' 19 | PASSWORD = 'zb748491' 20 | BORDER = 6 21 | INIT_LEFT = 60 22 | 23 | 24 | class CrackGeetest(): 25 | def __init__(self): 26 | self.url = 'https://account.geetest.com/login' 27 | self.browser = webdriver.Chrome() 28 | self.wait = WebDriverWait(self.browser, 20) 29 | self.email = EMAIL 30 | self.password = PASSWORD 31 | 32 | def __del__(self): 33 | self.browser.close() 34 | 35 | def get_geetest_button(self): 36 | """ 37 | 获取初始验证按钮 38 | :return: 39 | """ 40 | button = self.wait.until(EC.element_to_be_clickable((By.CLASS_NAME, 'geetest_radar_tip'))) 41 | return button 42 | 43 | def get_position(self): 44 | """ 45 | 获取验证码位置 46 | :return: 验证码位置元组 47 | """ 48 | img = self.wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'geetest_canvas_img'))) 49 | time.sleep(2) 50 | location = img.location 51 | size = img.size 52 | top, bottom, left, right = location['y'], location['y'] + size['height'], location['x'], location['x'] + size[ 53 | 'width'] 54 | return (top, bottom, left, right) 55 | 56 | def get_screenshot(self): 57 | """ 58 | 获取网页截图 59 | :return: 截图对象 60 | """ 61 | screenshot = self.browser.get_screenshot_as_png() 62 | screenshot = Image.open(BytesIO(screenshot)) 63 | return screenshot 64 | 65 | def get_slider(self): 66 | """ 67 | 获取滑块 68 | :return: 滑块对象 69 | """ 70 | slider = self.wait.until(EC.element_to_be_clickable((By.CLASS_NAME, 'geetest_slider_button'))) 71 | return slider 72 | 73 | def get_geetest_image(self, name='captcha.png'): 74 | """ 75 | 获取验证码图片 76 | :return: 图片对象 77 | """ 78 | top, bottom, left, right = self.get_position() 79 | print('验证码位置', top, bottom, left, right) 80 | screenshot = self.get_screenshot() 81 | captcha = screenshot.crop((left, top, right, bottom)) 82 | captcha.save(name) 83 | return captcha 84 | 85 | def open(self): 86 | """ 87 | 打开网页输入用户名密码 88 | :return: None 89 | """ 90 | self.browser.get(self.url) 91 | email = self.wait.until(EC.presence_of_element_located((By.ID, 'email'))) 92 | password = self.wait.until(EC.presence_of_element_located((By.ID, 'password'))) 93 | email.send_keys(self.email) 94 | password.send_keys(self.password) 95 | 96 | def get_gap(self, image1, image2): 97 | """ 98 | 获取缺口偏移量 99 | :param image1: 不带缺口图片 100 | :param image2: 带缺口图片 101 | :return: 102 | """ 103 | left = 60 104 | for i in range(left, image1.size[0]): 105 | for j in range(image1.size[1]): 106 | if not self.is_pixel_equal(image1, image2, i, j): 107 | left = i 108 | return left 109 | return left 
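    # Note (added): get_gap() above scans the two screenshots column by column starting
    # from x = 60 (presumably to skip the slider area at the left edge) and returns the
    # first x at which some pixel differs noticeably between the full image and the
    # gapped image; that x is then taken as the left edge of the gap. The actual pixel
    # comparison is done by is_pixel_equal() below with a per-channel threshold.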
110 | 111 | def is_pixel_equal(self, image1, image2, x, y): 112 | """ 113 | 判断两个像素是否相同 114 | :param image1: 图片1 115 | :param image2: 图片2 116 | :param x: 位置x 117 | :param y: 位置y 118 | :return: 像素是否相同 119 | """ 120 | # 取两个图片的像素点 121 | pixel1 = image1.load()[x, y] 122 | pixel2 = image2.load()[x, y] 123 | threshold = 60 124 | if abs(pixel1[0] - pixel2[0]) < threshold and abs(pixel1[1] - pixel2[1]) < threshold and abs( 125 | pixel1[2] - pixel2[2]) < threshold: 126 | return True 127 | else: 128 | return False 129 | 130 | def get_track(self, distance): 131 | """ 132 | 根据偏移量获取移动轨迹 133 | :param distance: 偏移量 134 | :return: 移动轨迹 135 | """ 136 | # 移动轨迹 137 | track = [] 138 | # 当前位移 139 | current = 0 140 | # 减速阈值 141 | mid = distance * 4 / 5 142 | # 计算间隔 143 | t = 0.2 144 | # 初速度 145 | v = 0 146 | 147 | while current < distance: 148 | if current < mid: 149 | # 加速度为正2 150 | a = 2 151 | else: 152 | # 加速度为负3 153 | a = -3 154 | # 初速度v0 155 | v0 = v 156 | # 当前速度v = v0 + at 157 | v = v0 + a * t 158 | # 移动距离x = v0t + 1/2 * a * t^2 159 | move = v0 * t + 1 / 2 * a * t * t 160 | # 当前位移 161 | current += move 162 | # 加入轨迹 163 | track.append(round(move)) 164 | return track 165 | 166 | def move_to_gap(self, slider, track): 167 | """ 168 | 拖动滑块到缺口处 169 | :param slider: 滑块 170 | :param track: 轨迹 171 | :return: 172 | """ 173 | ActionChains(self.browser).click_and_hold(slider).perform() 174 | for x in track: 175 | ActionChains(self.browser).move_by_offset(xoffset=x, yoffset=0).perform() 176 | time.sleep(0.5) 177 | ActionChains(self.browser).release().perform() 178 | 179 | def login(self): 180 | """ 181 | 登录 182 | :return: None 183 | """ 184 | submit = self.wait.until(EC.element_to_be_clickable((By.CLASS_NAME, 'login-btn'))) 185 | submit.click() 186 | time.sleep(10) 187 | print('登录成功') 188 | 189 | def crack(self): 190 | # 输入用户名密码 191 | self.open() 192 | # 点击验证按钮 193 | button = self.get_geetest_button() 194 | button.click() 195 | # 获取验证码图片 196 | image1 = self.get_geetest_image('captcha1.png') 197 | # 点按呼出缺口 198 | slider = self.get_slider() 199 | slider.click() 200 | # 获取带缺口的验证码图片 201 | image2 = self.get_geetest_image('captcha2.png') 202 | # 获取缺口位置 203 | gap = self.get_gap(image1, image2) 204 | print('缺口位置', gap) 205 | # 减去缺口位移 206 | gap -= BORDER 207 | # 获取移动轨迹 208 | track = self.get_track(gap) 209 | print('滑动轨迹', track) 210 | # 拖动滑块 211 | self.move_to_gap(slider, track) 212 | 213 | success = self.wait.until( 214 | EC.text_to_be_present_in_element((By.CLASS_NAME, 'geetest_success_radar_tip_content'), '验证成功')) 215 | print(success) 216 | 217 | # 失败后重试 218 | if not success: 219 | self.crack() 220 | else: 221 | self.login() 222 | 223 | 224 | if __name__ == '__main__': 225 | crack = CrackGeetest() 226 | crack.crack() 227 | -------------------------------------------------------------------------------- /python3网络爬虫实战练习/16-中国知网验证码识别/验证码识别.py: -------------------------------------------------------------------------------- 1 | import tesserocr 2 | from PIL import Image 3 | 4 | 5 | #当识别得到验证码结果和实际图片内容有差异时,可以试着将图片处理一下,比如转灰度值,二值化,此外,还可以指定二值化的阈值 6 | 7 | # image=Image.open('code.jpg') 8 | # result=tesserort.image_to_text(image) 9 | # print(result) 10 | 11 | #直接将图片转化为文字 12 | # print(tesserocr.file_to_text('code.png')) 13 | 14 | #若图片内容转化有差异,可做如下处理 15 | # image=Image.open('code.jpg') 16 | # image=image.convert('L') #将图片转化为灰度图像 17 | # image.show() 18 | # image=image.convert('1') 19 | # image.show() #将图片进行二值化处理 20 | 21 | image=Image.convert('L') 22 | threshold=80 #设置二值化阈值 23 | table=[] 24 | for i in range(256): 25 | if i span').text() 30 
| if domain: 31 | page_url = 'https:' + domain 32 | self.crawl(page_url, callback=self.domain_page) 33 | 34 | def domain_page(self, response): 35 | name = response.doc('.mm-p-model-info-left-top dd > a').text() 36 | dir_path = self.deal.mkDir(name) 37 | brief = response.doc('.mm-aixiu-content').text() 38 | if dir_path: 39 | imgs = response.doc('.mm-aixiu-content img').items() 40 | count = 1 41 | self.deal.saveBrief(brief, dir_path, name) 42 | for img in imgs: 43 | url = img.attr.src 44 | if url: 45 | extension = self.deal.getExtension(url) 46 | file_name = name + str(count) + '.' + extension 47 | count += 1 48 | self.crawl(img.attr.src, callback=self.save_img, 49 | save={'dir_path': dir_path, 'file_name': file_name}) 50 | 51 | def save_img(self, response): 52 | content = response.content 53 | dir_path = response.save['dir_path'] 54 | file_name = response.save['file_name'] 55 | file_path = dir_path + '/' + file_name 56 | self.deal.saveImg(content, file_path) 57 | 58 | 59 | import os 60 | 61 | class Deal: 62 | def __init__(self): 63 | self.path = DIR_PATH 64 | if not self.path.endswith('/'): 65 | self.path = self.path + '/' 66 | if not os.path.exists(self.path): 67 | os.makedirs(self.path) 68 | 69 | def mkDir(self, path): 70 | path = path.strip() 71 | dir_path = self.path + path 72 | exists = os.path.exists(dir_path) 73 | if not exists: 74 | os.makedirs(dir_path) 75 | return dir_path 76 | else: 77 | return dir_path 78 | 79 | def saveImg(self, content, path): 80 | f = open(path, 'wb') 81 | f.write(content) 82 | f.close() 83 | 84 | def saveBrief(self, content, dir_path, name): 85 | file_name = dir_path + "/" + name + ".txt" 86 | f = open(file_name, "w+") 87 | f.write(content.encode('utf-8')) 88 | 89 | def getExtension(self, url): 90 | extension = url.split('.')[-1] 91 | return extension -------------------------------------------------------------------------------- /python3网络爬虫实战练习/19-TaoBaoMM/data/result.db: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coolcooljob/Python3WebSpider-Test/b1ff058286fcf10742d08e7c45a74e6f1369708a/python3网络爬虫实战练习/19-TaoBaoMM/data/result.db -------------------------------------------------------------------------------- /python3网络爬虫实战练习/20-代理的使用.py: -------------------------------------------------------------------------------- 1 | #urllib 2 | #输出结果是一个JSON,它有一个字段origin,表明了客户端的IP。 3 | from urllib.error import URLError 4 | from urllib.request import ProxyHandler,build_opener 5 | 6 | proxy='' 7 | # proxy='username:password@' #若碰到需要认证的代理,可以进行这样的设置。 8 | proxy_handler=ProxyHandler({ 9 | 'http':'http://'+proxy, 10 | 'https':'https://'+proxy, 11 | }) 12 | opener=build_opener(proxy_handler) 13 | try: 14 | response=opener.open('http://httpbin.org/get') 15 | print(response.read().decode('utf-8')) 16 | except URLError as e: 17 | print(e.reason) 18 | 19 | #requests 20 | import requests 21 | 22 | proxy='' 23 | proxy='username:password@' #有用户验证的情况 24 | proxies={ 25 | 'http':'http://'+proxy, 26 | 'https':'https://'+proxy, 27 | } 28 | try: 29 | response=requests.get('http://httpbin.org/get',proxies=proxies) 30 | print(response.text) 31 | except requests.exceptions.ConnectionError as e: 32 | print('Error',e.args) 33 | 34 | 35 | #selenium(有界面:Chrome,无界面:Plantomjs) 36 | #Chrome 37 | from selenium import webdriver 38 | 39 | proxy='' 40 | chrome_options=webdriver.ChromeOptions() 41 | chrome_options.add_argument('--proxy-server=http://'+proxy) 42 | browser=webdriver.Chrome(chrome_options=chrome_options) 43 | 
browser.get('http://httpbin.org/get') 44 | 45 | #plantomjs 46 | from selenium import webdriver 47 | 48 | service_args=[ 49 | '--proxy=', 50 | '--proxy-type=http' 51 | ] 52 | # service_args=[ 53 | # '--proxy=127.0.0.1:9743', 54 | # '--proxy-type=http', 55 | # '--proxy-auth=username:password' 56 | # ] #需要进行认证设置的时候 57 | browser=webdriver.PhantomJS(service_args=service_args) 58 | browser.get('http://httpbin.org/get') 59 | print(browser.page_source) -------------------------------------------------------------------------------- /python3网络爬虫实战练习/3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coolcooljob/Python3WebSpider-Test/b1ff058286fcf10742d08e7c45a74e6f1369708a/python3网络爬虫实战练习/3.jpg -------------------------------------------------------------------------------- /python3网络爬虫实战练习/data.csv: -------------------------------------------------------------------------------- 1 | id name age 2 | 1 job 20 3 | 2 jack 22 4 | -------------------------------------------------------------------------------- /python3网络爬虫实战练习/data/project.db: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coolcooljob/Python3WebSpider-Test/b1ff058286fcf10742d08e7c45a74e6f1369708a/python3网络爬虫实战练习/data/project.db -------------------------------------------------------------------------------- /python3网络爬虫实战练习/data/result.db: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coolcooljob/Python3WebSpider-Test/b1ff058286fcf10742d08e7c45a74e6f1369708a/python3网络爬虫实战练习/data/result.db -------------------------------------------------------------------------------- /python3网络爬虫实战练习/data/scheduler.1d: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coolcooljob/Python3WebSpider-Test/b1ff058286fcf10742d08e7c45a74e6f1369708a/python3网络爬虫实战练习/data/scheduler.1d -------------------------------------------------------------------------------- /python3网络爬虫实战练习/data/scheduler.1h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coolcooljob/Python3WebSpider-Test/b1ff058286fcf10742d08e7c45a74e6f1369708a/python3网络爬虫实战练习/data/scheduler.1h -------------------------------------------------------------------------------- /python3网络爬虫实战练习/data/scheduler.all: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coolcooljob/Python3WebSpider-Test/b1ff058286fcf10742d08e7c45a74e6f1369708a/python3网络爬虫实战练习/data/scheduler.all -------------------------------------------------------------------------------- /python3网络爬虫实战练习/data/task.db: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coolcooljob/Python3WebSpider-Test/b1ff058286fcf10742d08e7c45a74e6f1369708a/python3网络爬虫实战练习/data/task.db -------------------------------------------------------------------------------- /python3网络爬虫实战练习/data1.csv: -------------------------------------------------------------------------------- 1 | id,name,age 2 | 100,job,22 3 | 101,tom,32 4 | 102,mary,25 5 | -------------------------------------------------------------------------------- /python3网络爬虫实战练习/ghostdriver.log: -------------------------------------------------------------------------------- 1 | [INFO - 2018-05-19T03:48:03.597Z] GhostDriver - Main - running on 
port 52052 2 | [INFO - 2018-05-19T03:48:08.043Z] Session [716db230-5b17-11e8-aebb-055f3c8bc10e] - page.settings - {"XSSAuditingEnabled":false,"javascriptCanCloseWindows":true,"javascriptCanOpenWindows":true,"javascriptEnabled":true,"loadImages":true,"localToRemoteUrlAccessEnabled":false,"userAgent":"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/538.1 (KHTML, like Gecko) PhantomJS/2.1.1 Safari/538.1","webSecurityEnabled":true} 3 | [INFO - 2018-05-19T03:48:08.043Z] Session [716db230-5b17-11e8-aebb-055f3c8bc10e] - page.customHeaders: - {} 4 | [INFO - 2018-05-19T03:48:08.043Z] Session [716db230-5b17-11e8-aebb-055f3c8bc10e] - Session.negotiatedCapabilities - {"browserName":"phantomjs","version":"2.1.1","driverName":"ghostdriver","driverVersion":"1.2.0","platform":"windows-10-32bit","javascriptEnabled":true,"takesScreenshot":true,"handlesAlerts":false,"databaseEnabled":false,"locationContextEnabled":false,"applicationCacheEnabled":false,"browserConnectionEnabled":false,"cssSelectorsEnabled":true,"webStorageEnabled":false,"rotatable":false,"acceptSslCerts":false,"nativeEvents":true,"proxy":{"proxyType":"direct"}} 5 | [INFO - 2018-05-19T03:48:08.044Z] SessionManagerReqHand - _postNewSessionCommand - New Session Created: 716db230-5b17-11e8-aebb-055f3c8bc10e 6 | --------------------------------------------------------------------------------