├── 10-Requests.py ├── 10.1busPath_Crawler.py ├── 11.1pytesser.py ├── 11.2jTessBoxEditor-tesseract.py ├── 11verification_code.py ├── 12video.py ├── 13.1thread_ood.py ├── 13.2thread_queue.py ├── 13.3Mthread_crawler.py ├── 13multiThread.py ├── 1urllib_base.py ├── 2ajax.py ├── 4handler.py ├── 5.1正则爬取糗.py ├── 5.2正则爬取励志网并建立文章集合页面.py ├── 58crawler ├── 58.ttf └── 58decode.py ├── 6.1read_list.py ├── 6xpath.py ├── 7pictureLoad.py ├── 8jsonpath.py ├── 9.1Chrome-headless.py ├── 9selenium.py ├── README.md ├── chineseUniversityRankCrawler └── RankofNuni.py ├── exe_file ├── 10 │ ├── baidu.html │ ├── bus_line.json │ ├── chinaunix_login.html │ ├── renren.html │ └── set_proxy.html ├── 11 │ ├── code.png │ ├── code1.png │ ├── code_2.png │ ├── gushi.html │ ├── gushiwen_code │ │ ├── gu.bat │ │ ├── gu.traineddata │ │ ├── train_toBox.bat │ │ ├── 第一轮训练.rar │ │ ├── 第三轮训练.rar │ │ ├── 第二轮训练.rar │ │ └── 第四轮训练.rar │ ├── test │ │ └── 0-9A-Z训练字典 │ │ │ └── gu.traineddata │ └── verify_code │ │ └── verify_code.rar ├── 12 │ └── download │ │ └── test.txt ├── 13 │ └── bus_line.json ├── baidu.png ├── book.json ├── chrome-driver │ └── chromedriver.exe ├── douban.html ├── douban.png ├── douban_d.png ├── hello.txt ├── meinv.png ├── python_postion.csv ├── show.png ├── szchina_page_1.html ├── szchina_page_2.html ├── xinggan │ ├── hpic408_s.jpg │ ├── zzpic12973_s.jpg │ ├── zzpic13004_s.jpg │ ├── zzpic13068_s.jpg │ ├── zzpic13087_s.jpg │ ├── zzpic13131_s.jpg │ ├── zzpic13242_s.jpg │ ├── zzpic13256_s.jpg │ ├── zzpic13424_s.jpg │ ├── zzpic13487_s.jpg │ ├── zzpic13589_s.jpg │ ├── zzpic13628_s.jpg │ ├── zzpic13668_s.jpg │ ├── zzpic13710_s.jpg │ ├── zzpic13772_s.jpg │ ├── zzpic13941_s.jpg │ ├── zzpic14042_s.jpg │ ├── zzpic14131_s.jpg │ ├── zzpic14178_s.jpg │ ├── zzpic14185_s.jpg │ ├── zzpic14298_s.jpg │ ├── zzpic14358_s.jpg │ ├── zzpic14425_s.jpg │ ├── zzpic14458_s.jpg │ ├── zzpic14479_s.jpg │ ├── zzpic14568_s.jpg │ ├── zzpic14603_s.jpg │ ├── zzpic14638_s.jpg │ ├── zzpic14802_s.jpg │ ├── zzpic14872_s.jpg │ ├── zzpic14965_s.jpg │ ├── zzpic15059_s.jpg │ ├── zzpic15084_s.jpg │ ├── zzpic15247_s.jpg │ ├── zzpic15324_s.jpg │ ├── zzpic15420_s.jpg │ ├── zzpic15469_s.jpg │ ├── zzpic15567_s.jpg │ ├── zzpic15608_s.jpg │ ├── zzpic15786_s.jpg │ ├── zzpic15891_s.jpg │ ├── zzpic15920_s.jpg │ ├── zzpic16049_s.jpg │ ├── zzpic16135_s.jpg │ ├── zzpic16191_s.jpg │ ├── zzpic16240_s.jpg │ ├── zzpic16394_s.jpg │ ├── zzpic16406_s.jpg │ ├── zzpic16566_s.jpg │ ├── zzpic16638_s.jpg │ ├── zzpic16686_s.jpg │ ├── zzpic16786_s.jpg │ ├── zzpic16807_s.jpg │ ├── zzpic16817_s.jpg │ ├── zzpic16857_s.jpg │ ├── zzpic16889_s.jpg │ ├── zzpic16921_s.jpg │ ├── zzpic16949_s.jpg │ ├── zzpic17052_s.jpg │ ├── zzpic17175_s.jpg │ ├── zzpic17202_s.jpg │ ├── zzpic17322_s.jpg │ ├── zzpic17359_s.jpg │ ├── zzpic17378_s.jpg │ ├── zzpic17442_s.jpg │ ├── zzpic17558_s.jpg │ ├── zzpic17615_s.jpg │ ├── zzpic17727_s.jpg │ ├── zzpic17778_s.jpg │ ├── zzpic17797_s.jpg │ ├── zzpic17879_s.jpg │ ├── zzpic17946_s.jpg │ ├── zzpic18038_s.jpg │ ├── zzpic18089_s.jpg │ ├── zzpic18110_s.jpg │ ├── zzpic18144_s.jpg │ ├── zzpic18308_s.jpg │ ├── zzpic18433_s.jpg │ ├── zzpic18631_s.jpg │ └── zzpic18883_s.jpg └── xpath.html ├── fillder.py ├── meizhuo_crawler.py ├── pictureCrawler ├── PictureDown.py ├── depthPicCrawler.py ├── informationMark.py └── multiPicDown.py ├── tesseract训练模型 ├── 0-9A-Z.png ├── README.md ├── combine.png └── oriCode.png └── zhilianCrawler.py /10-Requests.py: -------------------------------------------------------------------------------- 1 | """ 2 | Requests库:安装 pip install requests 3 | 官方文档: 4 
| http://cn.python-requests.org/zh_CN/latest/ 5 | what to do? 6 | 与urllib功能相似 7 | get 请求 8 | 定制头部 -requests.get(url=url,headers=headers,params=data) 9 | 响应对象 10 | r.text 字符串形式查看响应 11 | r.content 字符类型查看响应 12 | r.encoding 查看或者设置编码类型 13 | r.status_code 查看响应状态 14 | r.headers 查看响应头部 15 | r.url 查看请求url 16 | r.json 查看json数据 17 | 18 | post 请求 19 | 必应翻译 20 | requests.post(url=url,headers=headers,data=data) 21 | ajax、get、post 22 | 和上面是一样的 23 | 代理 24 | requests.get(url=url,headers=headers,proxies=proxy) 25 | cookie 26 | 实现人人登陆 27 | 留坑: 28 | 教程中的chinaunix改版并且难以登陆操作,在此跳过 29 | 如有解决方法,请联系我 30 | """ 31 | 32 | import requests 33 | 34 | 35 | # 带头部的Requests应用 36 | url = 'http://www.baidu.com/' 37 | headers = { 38 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) ' 39 | 'AppleWebKit/537.36 (KHTML, like Gecko) ' 40 | 'Chrome/75.0.3770.142 Safari/537.36', 41 | } 42 | request = requests.get(url=url,headers=headers) 43 | 44 | request.encoding = 'utf-8' 45 | # print(request.text) 46 | 47 | # 带参数的get 48 | # https://www.baidu.com/s?ie=utf-8&f=8&rsv_bp=1&rsv_idx=1&tn=baidu&wd=中国 49 | ''' 50 | 坑:一开始url用 'http://www.baidu.com/' 这个, 51 | 结果在构造搜索请求时返回的一直是百度首页,尬住,请用下面这个 52 | ''' 53 | url = 'http://www.baidu.com/s?' 54 | data = { 55 | 'ie':'utf-8', 56 | 'wd':'中国' 57 | } 58 | request = requests.get(url=url,headers=headers,params=data) 59 | request.encoding = 'utf-8' 60 | 61 | with open('exe_file/10/baidu.html','wb') as fp: 62 | fp.write(request.content) 63 | 64 | 65 | 66 | 67 | # post请求:必应翻译实战 68 | url = 'https://cn.bing.com/tlookupv3?isVertical=1&&' \ 69 | 'IG=B25CDCC5FE9D4B2EA382D628AFEAFDCD&IID=translator.5028.5' 70 | # 构造表单 71 | data = { 72 | 'from': 'zh-Hans', 73 | 'to': 'en', 74 | 'text': 'compute', 75 | } 76 | """ 77 | request = requests.post(url=url,headers=headers,data=data) 78 | # request.encoding = 'utf-8' 79 | print(request.json()) 80 | 81 | 82 | # 代理的使用 83 | url = 'https://www.baidu.com/s?ie=utf-8&f=8&' \ 84 | 'rsv_bp=1&rsv_idx=1&tn=baidu&wd=ip' 85 | proxy = { 86 | 'http':'http://113.54.153.217:1080' 87 | } 88 | request = requests.get(url=url,headers=headers,proxies=proxy) 89 | request.encoding = 'utf-8' 90 | with open('exe_file/10/set_proxy.html','wb') as fp: 91 | fp.write(request.content) 92 | """ 93 | 94 | """ 95 | # 带cookie登陆 96 | # 创建一个会话session,用于保存cookie信息,后续的请求利用session来发送 97 | session = requests.Session() 98 | url = 'http://www.renren.com/ajaxLogin/login?1=1' 99 | formdata = { 100 | 'email':'15625266605', 101 | 'icode' :'', 102 | 'origURL':'http://www.renren.com/home', 103 | 'domain':'renren.com', 104 | 'key_id':'1', 105 | 'captcha_type': 'web_login', 106 | 'password': '1162c49a98a09a374364c99e2ad203b82211bc9cfdf8411e3b47d3ae268ec869', 107 | 'rkey': '54fa0fe478cb62a6ae1184e8e15c9dbb', 108 | 'f':'http%3A%2F%2Fwww.renren.com%2F969920379', 109 | } 110 | 111 | request = session.post(url=url,headers=headers,data=formdata) 112 | # print(request.text) 113 | # >>>{"code":true,"homeUrl":"http://www.renren.com/home"} 114 | 115 | # 登陆后访问主页 116 | home_url = 'http://www.renren.com/home' 117 | home_page = session.get(url=home_url,headers=headers) 118 | home_page.encoding = 'utf-8' 119 | with open('exe_file/10/renren.html','wb') as fp: 120 | fp.write(home_page.content) 121 | 122 | """ 123 | 124 | -------------------------------------------------------------------------------- /10.1busPath_Crawler.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import json 3 | from lxml import etree 4 | import time 5 | 6 | # 获取当前时间 7 | localtime = time.asctime( 
time.localtime(time.time()) ) 8 | 9 | url = 'https://shenzhen.8684.cn' 10 | headers = { 11 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) ' 12 | 'AppleWebKit/537.36 (KHTML, like Gecko) ' 13 | 'Chrome/75.0.3770.142 Safari/537.36', 14 | } 15 | result = [] 16 | # 请求指定url的内容 17 | def handle_request(request_url): 18 | try: 19 | request = requests.get(url=request_url,headers=headers) 20 | request.raise_for_status() 21 | request.encoding = request.apparent_encoding 22 | return request.text 23 | except: 24 | print(request_url + ' get failed') 25 | return 'NULL' 26 | 27 | # 首页导航 28 | def parse_navigation(): 29 | content = handle_request(request_url=url) 30 | tree = etree.HTML(content) 31 | 32 | # 获取以数字开头的连接 33 | number_href_list = tree.xpath('//div[@class="bus_kt_r1"]/a/@href') 34 | # 获取以字母开头的连接 35 | char_href_list = tree.xpath('//div[@class="bus_kt_r2"]/a/@href') 36 | # 将爬取的导航链接列表返回 37 | return number_href_list + char_href_list 38 | 39 | # 爬取以1(数字或字符)开头的某条线路的所有公交线 40 | def parse_singlePath(navi_list): 41 | # 遍历上面的列表,依次发送请求,解析内容,获取每一个页面 42 | for navi in navi_list: 43 | path_url = url + navi 44 | print(path_url) 45 | content = handle_request(request_url=path_url) 46 | # 解析内容,获取每一路公交具体的url 47 | parse_specialroute(content) 48 | 49 | pass 50 | 51 | # 获取每一条具体公交线的链接尾缀及名称 52 | def parse_specialroute(content): 53 | tree = etree.HTML(content) 54 | route_infos = tree.xpath('//div[@class="stie_list"]/a') 55 | # print(len(route_infos)) 56 | for route_info in route_infos: 57 | # 该线路的url后缀 58 | route_suffix = route_info.xpath('.//@href')[0] 59 | # 名称 60 | route_name = route_info.xpath('.//@title')[0] 61 | # print(route_suffix,route_name) 62 | # 获取每一条具体公交线路的具体信息 63 | get_specialroute(route_suffix,route_name) 64 | 65 | #获取每一条具体公交线路的具体信息 66 | def get_specialroute(route_suffix,route_name): 67 | # 请求页面 68 | content = handle_request(url+route_suffix) 69 | tree = etree.HTML(content) 70 | # 公交信息的标签位置 71 | bus_basic_infos = tree.xpath('//div[@class="bus_i_content"]')[0] 72 | 73 | # 获取线路名称、运营时间、票价 74 | bus_name = bus_basic_infos.xpath('./div[@class="bus_i_t1"]/h1/text()')[0]\ 75 | .replace(' ','') # 替换掉特殊编码 76 | bus_runtime = bus_basic_infos.xpath('./p[1]/text()')[0].replace('运行时间:','') 77 | bus_fares = bus_basic_infos.xpath('./p[2]/text()')[0].replace('票价信息:','') 78 | bus_company = bus_basic_infos.xpath('./p[3]/a/text()')[0] 79 | bus_update = bus_basic_infos.xpath('./p[4]/text()')[0].replace('最后更新:','') 80 | # print(bus_name) 81 | # print(bus_runtime) 82 | # print(bus_fares) 83 | # print(bus_company) 84 | # print(bus_update) 85 | 86 | # 获取线路站点 87 | ''' 88 | 坑:原本思路是找到//div[@class="bus_line_site"][1](第一个,也就是起点到终点的单程站集) 89 | 下的--div[@class="bus_site_layer"],但是一直找不到,所以最后直接找后者,这时得到的站集 90 | 是来回的,取列表的1/2,可以得到单程站集 91 | 填坑:实际上是"bus_line_site ",得再加一个空格 92 | ''' 93 | bus_line = tree.xpath('//div[@class="bus_site_layer"]') 94 | length = len(bus_line) 95 | bus_line = bus_line[:int(length/2)] 96 | sites = [] 97 | for line in bus_line: 98 | for site in line.xpath('./div'): 99 | sites.append(site.xpath('./a/text()')[0]) 100 | # print(sites) 101 | 102 | bus_data = { 103 | '线路名称' : bus_name, 104 | '运行时间' : bus_runtime, 105 | '票价信息' : bus_fares, 106 | '运营公司' : bus_company, 107 | '更新时间' : bus_update, 108 | '经过站点' : sites, 109 | } 110 | 111 | # 公交线路放入结果中 112 | result.append(bus_data) 113 | 114 | 115 | 116 | def main(): 117 | # 获取导航页全部的线路(数字字母)开头的url 118 | navi_list = parse_navigation() 119 | 120 | # 爬取以某个(数字或字符)开头的某条线路的所有公交线 121 | parse_singlePath(navi_list) 122 | 123 | # 将bus_data 存入一个result列表,构造<"result":result>键值对并存入一个新字典 124 
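# 示意:最终写出的 bus_line.json 大致形如下面的结构(字段与下方构造的字典一致,具体取值以实际爬取结果为准):
# {
#     "json_name": "深圳公交线路汇总",
#     "updatetime": "Sat Jul 20 ...",
#     "results": [
#         {"线路名称": "...", "运行时间": "...", "票价信息": "...",
#          "运营公司": "...", "更新时间": "...", "经过站点": ["...", "..."]},
#         ...
#     ]
# }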
| # 将字典转成json格式并存入json文件 125 | shenzhen_busLine = { 126 | 'json_name' : '深圳公交线路汇总', 127 | 'updatetime' : localtime, 128 | 'results' : result 129 | } 130 | file = open('exe_file/10/bus_line.json','w',encoding='utf-8') 131 | 132 | """ 133 | json.dump() 134 | 把字典转成json串,并自动写入文件中 135 | dump参数是(字典,文件句柄,indent)。indent用于缩进美化json串的 136 | ensure_ascii=False用于写文件时有unicode时用,正常显示出中文来 137 | """ 138 | json.dump(shenzhen_busLine,file,indent=4,ensure_ascii=False) 139 | if __name__ == '__main__': 140 | main() -------------------------------------------------------------------------------- /11.1pytesser.py: -------------------------------------------------------------------------------- 1 | import pytesseract 2 | from PIL import Image 3 | from PIL import ImageEnhance 4 | """ 5 | tesseract 安装及使用 6 | OCR,即Optical Character Recognition,光学字符识别,是指通过扫描字符,然后通过其形状将其翻译成电子文本的过程。 7 | 对于图形验证码来说,它们都是一些不规则的字符,这些字符确实是由字符稍加扭曲变换得到的内容。 8 | 参考: 9 | Windows安装Tesseract-OCR 4.00并配置环境变量:https://segmentfault.com/a/1190000014086067 10 | 图像文字识别(三):Tesseract4.0训练字库,提高正确识别率:https://blog.csdn.net/a745233700/article/details/80175883 11 | PIL可以做很多和图像处理相关的事情: 12 | 图像归档(Image Archives): 13 | PIL非常适合于图像归档以及图像的批处理任务。你可以使用PIL创建缩略图,转换图像格式,打印图像等等。 14 | 图像展示(Image Display): 15 | PIL较新的版本支持包括Tk PhotoImage,BitmapImage还有Windows DIB等接口。PIL支持众多的GUI框架接口,可以用于图像展示。 16 | 图像处理(Image Processing): 17 | PIL包括了基础的图像处理函数,包括对点的处理,使用众多的卷积核(convolution kernels)做过滤(filter),还有颜色空间的转换。 18 | PIL库同样支持图像的大小转换,图像旋转,以及任意的仿射变换。PIL还有一些直方图的方法,允许你展示图像的一些统计特性。 19 | 这个可以用来实现图像的自动对比度增强,还有全局的统计分析等。 20 | 具体参考: 21 | PIL介绍:https://www.cnblogs.com/lyrichu/p/9124504.html 22 | Python图像处理库PIL的ImageEnhance模块介绍:https://blog.csdn.net/icamera0/article/details/50753705 23 | 24 | ***python+tesseract 训练和破解验证码:https://zhuanlan.zhihu.com/p/40178190 25 | ***介绍了命令行的操作形式:超级详细的Tesseract-OCR样本训练方法https://blog.csdn.net/sylsjane/article/details/83751297 26 | ***tesseract v4.0.0 帮助文档解读:https://blog.csdn.net/qq_32674197/article/details/80744783 27 | ****tesseract_ocr训练字库、合并字库:https://www.imooc.com/article/32331 28 | """ 29 | img = Image.open('exe_file/11/code1.png') 30 | print(img) 31 | 32 | img= img.convert('RGB') 33 | # 颜色调到最暗 34 | enhancer = ImageEnhance.Color(img) 35 | enhancer = enhancer.enhance(0) 36 | # 增加亮度 37 | enhancer = ImageEnhance.Brightness(enhancer) 38 | enhancer = enhancer.enhance(4) 39 | # 增加对比度 40 | enhancer = ImageEnhance.Contrast(enhancer) 41 | enhancer = enhancer.enhance(15) 42 | # 增加图片锐度 43 | enhancer = ImageEnhance.Sharpness(enhancer) 44 | img = enhancer.enhance(25) 45 | # img.show() 46 | 47 | # 转成灰度图片 48 | img = img.convert('L') 49 | # img.show() 50 | #二值化处理 51 | threshold = 140 52 | table=[] 53 | for i in range(256): 54 | if i < threshold: 55 | table.append(0) 56 | else: 57 | table.append(1) 58 | out = img.point(table,'1') 59 | out.show() 60 | # img = img.convert('RGB') 61 | # out.save('exe_file/11/gushiwen_code/35.png','png') 62 | 63 | print(pytesseract.image_to_string(out,lang='gu',config='--psm 7')) -------------------------------------------------------------------------------- /11.2jTessBoxEditor-tesseract.py: -------------------------------------------------------------------------------- 1 | """ 2 | 验证码训练脚本 3 | Author:caixiaoxin 4 | date:2019/7/23 5 | """ 6 | from PIL import ImageEnhance 7 | from PIL import Image 8 | import pytesseract 9 | from bs4 import BeautifulSoup 10 | import os 11 | import requests 12 | 13 | headers = { 14 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) ' 15 | 'AppleWebKit/537.36 (KHTML, like Gecko) ' 16 | 'Chrome/75.0.3770.142 
Safari/537.36', 17 | } 18 | # 根据训练字库识别验证码 19 | def get_varifyCode()->str: 20 | img = Image.open('exe_file/11/code.png') 21 | # print(img) 22 | img = img.convert('RGB') 23 | # 颜色调到最暗 24 | enhancer = ImageEnhance.Color(img) 25 | enhancer = enhancer.enhance(0) 26 | # 增加亮度 27 | enhancer = ImageEnhance.Brightness(enhancer) 28 | enhancer = enhancer.enhance(2) 29 | # 增加对比度 30 | enhancer = ImageEnhance.Contrast(enhancer) 31 | enhancer = enhancer.enhance(8) 32 | # 增加图片锐度 33 | enhancer = ImageEnhance.Sharpness(enhancer) 34 | img = enhancer.enhance(20) 35 | # img.show() 36 | 37 | # 转成灰度图片 38 | img = img.convert('L') 39 | # img.show() 40 | # 二值化处理 41 | threshold = 140 42 | table = [] 43 | for i in range(256): 44 | if i < threshold: 45 | table.append(0) 46 | else: 47 | table.append(1) 48 | out = img.point(table, '1') 49 | # out.show() 50 | # img = img.convert('RGB') 51 | out.save('exe_file/11/code.png','png') 52 | code = pytesseract.image_to_string(out,lang='gu',config='--psm 7') 53 | code = code.replace(' ','') # 除去空格 54 | return code 55 | 56 | # 下载验证码 57 | def download_code(session): 58 | url = 'https://so.gushiwen.org/user/login.aspx?' \ 59 | 'from=http://so.gushiwen.org/user/collect.aspx' 60 | request = session.get(url=url, headers=headers) 61 | soup = BeautifulSoup(request.text,'lxml') 62 | 63 | ''' 64 | 问题:url相同,为什么每次获取的验证码不同 65 | 同个url下,通过cookie随机生成验证码 66 | 所以需要在获取验证码,登陆这个过程需要建立会话 67 | ''' 68 | img_src = 'https://so.gushiwen.org' + \ 69 | soup.find('img',id='imgCode')['src'] 70 | # print(img_src) 71 | img = session.get(url=img_src,headers=headers) 72 | with open('exe_file/11/code.png','wb') as fp: 73 | fp.write(img.content) 74 | 75 | # 查找表单需要的两个参数 76 | __VIEWSTATE = soup.find('input', id='__VIEWSTATE')['value'] 77 | __VIEWSTATEGENERATOR = soup.find('input', id='__VIEWSTATEGENERATOR')['value'] 78 | 79 | # 识别验证码 80 | code = get_varifyCode() 81 | 82 | return __VIEWSTATE, __VIEWSTATEGENERATOR, code 83 | 84 | # post登陆 85 | def login(__VIEWSTATE, __VIEWSTATEGENERATOR, code, session)->bool: 86 | post_url = 'https://so.gushiwen.org/user/login.aspx?' 
\ 87 | 'from=http%3a%2f%2fso.gushiwen.org%2fuser%2fcollect.aspx' 88 | data = { 89 | '__VIEWSTATE' : __VIEWSTATE, 90 | '__VIEWSTATEGENERATOR' : __VIEWSTATEGENERATOR, 91 | 'from' : 'http://so.gushiwen.org/user/collect.aspx', 92 | 'email' : '15625266605', 93 | 'pwd' : '123456', 94 | 'code' : code, 95 | 'denglu': '登录', 96 | } 97 | # 登陆 98 | request = session.post(url=post_url,headers=headers,data=data) 99 | # print(len(request.text)) 100 | if len(request.text)==35822: 101 | return False 102 | else: 103 | return True 104 | # 实现模拟登陆,如果验证码识别错误,将有误验证码存入 105 | def test_login()->bool: 106 | # 创建会话 107 | session = requests.Session() 108 | # 下载验证码到本地 109 | __VIEWSTATE, __VIEWSTATEGENERATOR, code = download_code(session) 110 | 111 | status = login(__VIEWSTATE, __VIEWSTATEGENERATOR, code ,session) 112 | 113 | if status is not True: 114 | try: 115 | img = Image.open('exe_file/11/code.png') 116 | img.save('exe_file/11/verify_code/{}.png'.format(code), 'png') 117 | except OSError: 118 | pass 119 | return False 120 | else: return True 121 | 122 | # 批量处理验证码图片 123 | def deal_img(): 124 | root = 'exe_file/11/gushiwen_code/' 125 | ind = 0 126 | # 从100张图片中提取出字符样本 127 | for image in os.listdir(root): 128 | img = Image.open(root + image) 129 | img = img.convert('RGB') 130 | # 颜色调到最暗 131 | enhancer = ImageEnhance.Color(img) 132 | enhancer = enhancer.enhance(0) 133 | # 增加亮度 134 | enhancer = ImageEnhance.Brightness(enhancer) 135 | enhancer = enhancer.enhance(2) 136 | # 增加对比度 137 | enhancer = ImageEnhance.Contrast(enhancer) 138 | enhancer = enhancer.enhance(8) 139 | # 增加图片锐度 140 | enhancer = ImageEnhance.Sharpness(enhancer) 141 | img = enhancer.enhance(20) 142 | # img.show() 143 | 144 | # 转成灰度图片 145 | img = img.convert('L') 146 | # img.show() 147 | # 二值化处理 148 | threshold = 140 149 | table = [] 150 | for i in range(256): 151 | if i < threshold: 152 | table.append(0) 153 | else: 154 | table.append(1) 155 | out = img.point(table, '1') 156 | out.save(root+'{}.png'.format(ind),'png') 157 | ind = ind + 1 158 | 159 | if __name__ == '__main__': 160 | # 测试识别准确率 161 | test_num = 200 162 | correct_num = 0 163 | for i in range(test_num): 164 | if test_login() is True: 165 | correct_num += 1 166 | print("准确率{}%".format(correct_num*100/test_num)) 167 | # deal_img() 168 | 169 | 170 | 171 | -------------------------------------------------------------------------------- /11verification_code.py: -------------------------------------------------------------------------------- 1 | """ 2 | 验证码 3 | 登陆古诗文网 4 | 将验证码下载到本地 5 | 在登陆页面中获取表单的两个重要参数 6 | 整个过程在会话状态下进行 7 | """ 8 | 9 | import requests 10 | from bs4 import BeautifulSoup 11 | 12 | headers = { 13 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) ' 14 | 'AppleWebKit/537.36 (KHTML, like Gecko) ' 15 | 'Chrome/75.0.3770.142 Safari/537.36', 16 | } 17 | def download_code(session): 18 | url = 'https://so.gushiwen.org/user/login.aspx?' 
\ 19 | 'from=http://so.gushiwen.org/user/collect.aspx' 20 | request = session.get(url=url, headers=headers) 21 | soup = BeautifulSoup(request.text,'lxml') 22 | 23 | ''' 24 | 问题:url相同,为什么每次获取的验证码不同 25 | 同个url下,通过cookie随机生成验证码 26 | 所以需要在获取验证码,登陆这个过程需要建立会话 27 | ''' 28 | img_src = 'https://so.gushiwen.org' + \ 29 | soup.find('img',id='imgCode')['src'] 30 | # print(img_src) 31 | img = session.get(url=img_src,headers=headers) 32 | with open('exe_file/11/code.png','wb') as fp: 33 | fp.write(img.content) 34 | 35 | # 查找表单需要的两个参数 36 | __VIEWSTATE = soup.find('input', id='__VIEWSTATE')['value'] 37 | __VIEWSTATEGENERATOR = soup.find('input', id='__VIEWSTATEGENERATOR')['value'] 38 | 39 | return __VIEWSTATE, __VIEWSTATEGENERATOR 40 | 41 | def login(__VIEWSTATE, __VIEWSTATEGENERATOR,session): 42 | post_url = 'https://so.gushiwen.org/user/login.aspx?' \ 43 | 'from=http%3a%2f%2fso.gushiwen.org%2fuser%2fcollect.aspx' 44 | # 提示用户输入验证码 45 | code = input('input verification code:') 46 | data = { 47 | '__VIEWSTATE' : __VIEWSTATE, 48 | '__VIEWSTATEGENERATOR' : __VIEWSTATEGENERATOR, 49 | 'from' : 'http://so.gushiwen.org/user/collect.aspx', 50 | 'email' : '15625266605', 51 | 'pwd' : '123456', 52 | 'code' : code, 53 | 'denglu': '登录', 54 | } 55 | # 登陆并且将页面写入文件 56 | request = session.post(url=post_url,headers=headers,data=data) 57 | print(len(request.text)) 58 | with open('exe_file/11/gushi_error.html','w',encoding='utf-8') as file: 59 | file.write(request.text) 60 | def main(): 61 | # 创建会话 62 | session = requests.Session() 63 | # 下载验证码到本地 64 | __VIEWSTATE, __VIEWSTATEGENERATOR = download_code(session) 65 | 66 | login(__VIEWSTATE, __VIEWSTATEGENERATOR,session) 67 | if __name__ == '__main__': 68 | main() 69 | -------------------------------------------------------------------------------- /12video.py: -------------------------------------------------------------------------------- 1 | import requests 2 | 3 | headers = { 4 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) ' 5 | 'AppleWebKit/537.36 (KHTML, like Gecko) ' 6 | 'Chrome/75.0.3770.142 Safari/537.36', 7 | } 8 | 9 | 10 | # e:下载视频 11 | """ 12 | tip:视频播放窗口是无法右键F12的,正确的做法是在暂停、倍数的功能栏进入开发者模式,就可以简单获取视频的url 13 | """ 14 | url = 'http://v1-default.ixigua.com/0675cf76b8a56330683ebbae99e4986e/5d3bbed0/video/m/' \ 15 | '220ed97da2708af47afa4bb16d59e4eba1f116131fb7000082d6359fa977/?rc=amd1NDY0dTtpajM' \ 16 | 'zPDczM0ApQHRAbzw7NTs6MzgzMzUzNDUzNDVvQGg2dilAZzN3KUBmM3UpZHNyZ3lrdXJneXJseHdmOzpAa' \ 17 | 'C1wNGtqMG9rXy0tLS0vc3MtbyNvIy8uMy0wMy4uMC4tNDQ2LTojbyM6YS1vIzpgLXAjOmB2aVxiZitgXmJmK15xbDojMy5e' 18 | 19 | r = requests.get(url=url,headers=headers) 20 | 21 | with open('exe_file/12/1.mp4','wb') as file: 22 | file.write(r.content) 23 | 24 | 25 | ''' 26 | 首先向365yg.com发送请求 27 | 获取响应,解析响应,将里面所有的标题链接获取到 28 | 依次向每个标题链接发送请求 29 | 获取响应,解析响应,获取video标签的src属性 30 | 向src属性发送请求,获取响应,将内容保存到本地 31 | ''' 32 | 33 | # 爬取主页的推荐视频 34 | from lxml import etree 35 | import json 36 | from selenium import webdriver 37 | from selenium.webdriver.chrome.options import Options 38 | import time 39 | 40 | # 请求指定url的内容 41 | def handle_request(request_url): 42 | try: 43 | request = requests.get(url=request_url,headers=headers) 44 | request.raise_for_status() 45 | request.encoding = request.apparent_encoding 46 | return request 47 | except: 48 | print(request_url + ' get failed') 49 | return 'NULL' 50 | 51 | # 解析视频页,获取视频的url 52 | def handle_href(a_href)->str: 53 | # 通过chrome-headless解决 54 | path = r'exe_file/chromedriver.exe' 55 | chrome_options = Options() 56 | chrome_options.add_argument('--headless') 57 
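# 可选(示意,按需添加):无头模式下部分页面会按小窗口布局渲染,必要时可以再指定窗口大小,例如
# chrome_options.add_argument('--window-size=1920,1080')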
| chrome_options.add_argument('--disable-gpu') # 上面三行代码就是为了将Chrome不弹出界面,实现无界面爬取 58 | browser = webdriver.Chrome(path, options=chrome_options) 59 | browser.get(a_href) 60 | time.sleep(3) 61 | # 获取源码,生成tree对象,然后查找video里面的src属性 62 | ''' 63 | code:tree = etree.HTML(browser.page_source,'lxml) 64 | TypeError: Argument 'parser' has incorrect type (expected lxml.etree._BaseParser, got str) 65 | 去掉lxml完美解决 66 | ''' 67 | # 利用xpath获取视频的url 68 | tree = etree.HTML(browser.page_source) 69 | video_src = tree.xpath('//video/@src')[0] 70 | browser.close() 71 | return video_src 72 | 73 | # 获取主页的视频信息 74 | def handle_title(widen:int): 75 | # json内容会根据widen属性变化 76 | basic_url = 'http://365yg.com/api/pc/feed/?max_behot_time=1564196117&category=video_new&utm_source=toutiao' \ 77 | '&widen={}&tadrequire=true&as=A125ED93CB0BDA9&cp=5D3B3BBDAA498E1&_signature=.sLedBAXpAP3jqRhTQlB7.7C3m' 78 | # 获取请求 79 | request = handle_request(basic_url.format(widen)) 80 | # 解析json数据 81 | json_obj = json.loads(request.text) 82 | # 取出与视频相关的数据,data是一个字典元素的列表,每个元素都是一个视频的所有信息 83 | data = json_obj['data'] 84 | # 循环data列表,依次取出每一个视频信息 85 | for video_data in data: 86 | title = video_data['title'] 87 | a_href = 'http://365yg.com' + video_data['source_url'] 88 | print('downloading~...' + title) 89 | video_src = handle_href(a_href) 90 | # print(video_src) 91 | ''' 92 | 调用写好的函数,下载速度会慢很多 93 | request = handle_request(video_src) 94 | with open('exe_file/12/download/{}.mp4'.format(title), 'wb') as file: 95 | file.write(request.content) 96 | ''' 97 | r = requests.get(url=url, headers=headers) 98 | with open('exe_file/12/download/{}.mp4'.format(title), 'wb') as file: 99 | file.write(r.content) 100 | print('finish') 101 | def main(): 102 | handle_title(1) 103 | if __name__ == '__main__': 104 | main() -------------------------------------------------------------------------------- /13.1thread_ood.py: -------------------------------------------------------------------------------- 1 | import threading 2 | import time 3 | # 写一个类,继承自threading.Thread 4 | class Singthread(threading.Thread): 5 | def __init__(self, name, a): 6 | super().__init__() 7 | self.name = name 8 | self.a = a 9 | def run(self): 10 | for x in range(1, 6): 11 | print('I am sing') 12 | time.sleep(1) 13 | 14 | class Dancethread(threading.Thread): 15 | def __init__(self, name, a): 16 | super().__init__() 17 | self.name = name 18 | self.a = a 19 | def run(self): 20 | for x in range(1, 6): 21 | print('I am dancing') 22 | time.sleep(1) 23 | 24 | def main(): 25 | # create thread 26 | tsing = Singthread('sing', 'cai') 27 | tdance = Dancethread('dance', 'crayon') 28 | 29 | # start thread 30 | tsing.start() 31 | tdance.start() 32 | 33 | 34 | # waiting thread end 35 | tsing.join() 36 | tdance.join() 37 | 38 | 39 | print('I am Main') 40 | 41 | 42 | if __name__ == '__main__': 43 | main() 44 | -------------------------------------------------------------------------------- /13.2thread_queue.py: -------------------------------------------------------------------------------- 1 | from queue import Queue 2 | 3 | # create queue 4 | q = Queue(5) 5 | # store data 6 | q.put('c') 7 | q.put('c++') 8 | q.put('python') 9 | q.put('java') 10 | q.put('matlab') 11 | # q.put('markdown', True, 3) 12 | # q.put('markdown', False) 13 | # q.put('markdown') 14 | 15 | # get data 16 | # 先进先出 17 | print(q.get()) 18 | print(q.get()) 19 | print(q.get()) 20 | print(q.get()) 21 | print(q.get()) 22 | 23 | print(q.get()) # 队空阻塞 24 | -------------------------------------------------------------------------------- 
/13.3Mthread_crawler.py: -------------------------------------------------------------------------------- 1 | """ 2 | 多线程爬虫 3 | 分析 4 | 两类线程:下载、解析 5 | 内容队列:下载线程往队列中put数据,解析线程从队列get数据 6 | 数据 7 | url队列:下载线程从url队列中get数据 8 | 写数据:上锁 9 | 10 | """ 11 | import threading 12 | import requests 13 | import json 14 | from lxml import etree 15 | from queue import Queue 16 | import time 17 | import timeit 18 | 19 | # 队空退出标志 20 | navi_EXIT = False 21 | line_EXIT = False 22 | route_EXIT = False 23 | data_EXIT = False 24 | 25 | # 获取当前时间 26 | localtime = time.asctime( time.localtime(time.time()) ) 27 | 28 | url = 'https://shenzhen.8684.cn' 29 | headers = { 30 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) ' 31 | 'AppleWebKit/537.36 (KHTML, like Gecko) ' 32 | 'Chrome/75.0.3770.142 Safari/537.36', 33 | } 34 | # 请求指定url的内容 35 | def handle_request(request_url): 36 | try: 37 | request = requests.get(url=request_url,headers=headers) 38 | request.raise_for_status() 39 | request.encoding = request.apparent_encoding 40 | return request.text 41 | except: 42 | print(request_url + ' get failed') 43 | return 'NULL' 44 | 45 | # 首页导航 46 | def parse_navigation(): 47 | content = handle_request(request_url=url) 48 | tree = etree.HTML(content) 49 | 50 | # 获取以数字开头的连接 51 | number_href_list = tree.xpath('//div[@class="bus_kt_r1"]/a/@href') 52 | # 获取以字母开头的连接 53 | char_href_list = tree.xpath('//div[@class="bus_kt_r2"]/a/@href') 54 | # 将爬取的导航链接列表返回 55 | return number_href_list + char_href_list 56 | 57 | # 获取页面线程类-获取关键字页面 58 | class crawlerThread_getLine(threading.Thread): 59 | def __init__(self, threadName, naviQueue, lineQueue): 60 | super(crawlerThread_getLine, self).__init__() 61 | 62 | self.threadName = threadName 63 | self.naviQueue = naviQueue 64 | self.lineQueue = lineQueue 65 | def run(self): 66 | # 需保证主线程中队列空才能退出 67 | while not navi_EXIT: 68 | try: 69 | navi = self.naviQueue.get(False) # 设置False是为了避免队空阻塞死循环现象 70 | content = handle_request(url + navi) 71 | self.lineQueue.put(content) 72 | except: 73 | pass 74 | 75 | # 解析线程-将关键字页面中对应的线路url及名称解析出来 76 | class parseThread_getline(threading.Thread): 77 | def __init__(self, threadName, lineQueue, routeQueue): 78 | super(parseThread_getline, self).__init__() 79 | self.threadName = threadName 80 | self.lineQueue = lineQueue 81 | self.routeQueue = routeQueue 82 | def parse(self, content): 83 | tree = etree.HTML(content) 84 | route_infos = tree.xpath('//div[@class="stie_list"]/a') 85 | # print(len(route_infos)) 86 | for route_info in route_infos: 87 | # 该线路的url后缀 88 | route_suffix = route_info.xpath('.//@href')[0] 89 | # 名称 90 | route_name = route_info.xpath('.//@title')[0] 91 | # print(route_suffix,route_name) 92 | self.routeQueue.put((route_suffix, route_name)) 93 | 94 | def run(self): 95 | while not line_EXIT: 96 | try: 97 | content = self.lineQueue.get(False) 98 | self.parse(content) 99 | except: 100 | pass 101 | 102 | # 获取页面线程-获取具体线路信息页 103 | class crawlerThread_getRoute(threading.Thread): 104 | def __init__(self, threadName, routeQueue, dataQueue): 105 | super(crawlerThread_getRoute, self).__init__() 106 | 107 | self.threadName = threadName 108 | self.routeQueue = routeQueue 109 | self.dataQueue = dataQueue 110 | 111 | def run(self): 112 | while not route_EXIT: 113 | try: 114 | route_suffix, route_name = self.routeQueue.get(False) 115 | content = handle_request(url + route_suffix) 116 | # print(content) 117 | self.dataQueue.put(content) 118 | except: 119 | pass 120 | 121 | # 解析线程-解析具体线路的信息 122 | class parseThread_getRoute(threading.Thread): 123 | def __init__(self, 
threadName, dataQueue, result, lock): 124 | super(parseThread_getRoute, self).__init__() 125 | self.threadName = threadName 126 | self.dataQueue = dataQueue 127 | self.result = result 128 | self.lock = lock 129 | def parse(self, content): 130 | tree = etree.HTML(content) 131 | # 公交信息的标签位置 132 | bus_basic_infos = tree.xpath('//div[@class="bus_i_content"]')[0] 133 | 134 | # 获取线路名称、运营时间、票价 135 | bus_name = bus_basic_infos.xpath('./div[@class="bus_i_t1"]/h1/text()')[0] \ 136 | .replace(' ', '') # 替换掉特殊编码 137 | bus_runtime = bus_basic_infos.xpath('./p[1]/text()')[0].replace('运行时间:', '') 138 | bus_fares = bus_basic_infos.xpath('./p[2]/text()')[0].replace('票价信息:', '') 139 | bus_company = bus_basic_infos.xpath('./p[3]/a/text()')[0] 140 | bus_update = bus_basic_infos.xpath('./p[4]/text()')[0].replace('最后更新:', '') 141 | # print(bus_name) 142 | # print(bus_runtime) 143 | # print(bus_fares) 144 | # print(bus_company) 145 | # print(bus_update) 146 | 147 | # 获取线路站点 148 | ''' 149 | 坑:原本思路是找到//div[@class="bus_line_site"][1](第一个,也就是起点到终点的单程站集) 150 | 下的--div[@class="bus_site_layer"],但是一直找不到,所以最后直接找后者,这时得到的站集 151 | 是来回的,取列表的1/2,可以得到单程站集 152 | 填坑:实际上是"bus_line_site ",得再加一个空格 153 | ''' 154 | bus_line = tree.xpath('//div[@class="bus_site_layer"]') 155 | length = len(bus_line) 156 | bus_line = bus_line[:int(length / 2)] 157 | sites = [] 158 | for line in bus_line: 159 | for site in line.xpath('./div'): 160 | sites.append(site.xpath('./a/text()')[0]) 161 | # print(sites) 162 | 163 | bus_data = { 164 | '线路名称': bus_name, 165 | '运行时间': bus_runtime, 166 | '票价信息': bus_fares, 167 | '运营公司': bus_company, 168 | '更新时间': bus_update, 169 | '经过站点': sites, 170 | } 171 | # print(bus_data) 172 | with self.lock: 173 | # 公交线路放入结果中 174 | self.result.append(bus_data) 175 | print('\r' + self.threadName + '-当前已爬取线路数量:' + str(len(self.result)),end=' ') 176 | def run(self): 177 | while not data_EXIT: 178 | try: 179 | content = self.dataQueue.get(False) 180 | self.parse(content) 181 | except: 182 | pass 183 | 184 | def main(): 185 | 186 | # 初始化队列 187 | naviQueue = Queue() 188 | lineQueue = Queue() 189 | routeQueue = Queue() 190 | dataQueue = Queue() 191 | 192 | result = [] 193 | # 设置锁 194 | # 但好像没必要,因为list本来就是线程安全??? 
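# 补充:在 CPython 中受 GIL 影响,单次 list.append() 本身是原子的;
# 但这里加锁把“append 结果 + 打印进度”作为一个整体保护,避免多个解析线程交错输出,
# 也不依赖具体解释器实现,保留锁更稳妥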
195 | lock = threading.Lock() 196 | 197 | # 获取导航页面 198 | navi_list = parse_navigation() 199 | for navi in navi_list: 200 | naviQueue.put(navi) 201 | 202 | # ------------------------------------------------------------------------------------------- 203 | # 开启获取线程 204 | craw_getLine = ['craw-getLine' + str(i) for i in range(16)] 205 | # craw_getLine = ['craw-getLine1', 'craw-getLine2', 'craw-getLine3', 'craw-getLine4'] 206 | craw_getLine_Threads = [] 207 | for threadName in craw_getLine: 208 | thread = crawlerThread_getLine(threadName, naviQueue, lineQueue) 209 | thread.start() 210 | craw_getLine_Threads.append(thread) 211 | 212 | #------------------------------------------------------------------------------------------- 213 | # 开启解析线程-获取以某个关键字开头的所有线路概要信息 214 | parse_getLine = ['parse-getLine' + str(i) for i in range(16)] 215 | # parse_getLine = ['parse-getLine1', 'parse-getLine2', 'parse-getLine3', 'parse-getLine4'] 216 | parse_getLine_Threads = [] 217 | for threadName in parse_getLine: 218 | thread = parseThread_getline(threadName, lineQueue, routeQueue) 219 | thread.start() 220 | parse_getLine_Threads.append(thread) 221 | 222 | # ------------------------------------------------------------------------------------------- 223 | # 开启获取线程-获取具体的线路信息页 224 | craw_getRoute = ['craw-getRoute' + str(i) for i in range(16)] 225 | # craw_getRoute = ['craw-getRoute1', 'craw-getRoute2', 'craw-getRoute3', 'craw-getRoute4'] 226 | craw_getRoute_Threads = [] 227 | for threadName in craw_getRoute: 228 | thread = crawlerThread_getRoute(threadName, routeQueue, dataQueue) 229 | thread.start() 230 | craw_getRoute_Threads.append(thread) 231 | 232 | # ------------------------------------------------------------------------------------------- 233 | parse_getRoute = ['parse-getRoute' + str(i) for i in range(16)] 234 | # parse_getRoute = ['parse-getRoute1', 'parse-getRoute2', 'parse-getRoute3', 'parse-getRoute4'] 235 | parse_getRoute_Threads = [] 236 | for threadName in parse_getRoute: 237 | thread = parseThread_getRoute(threadName, dataQueue, result, lock) 238 | thread.start() 239 | parse_getRoute_Threads.append(thread) 240 | 241 | """ 242 | while......... 243 | for......... 
244 | .join() 245 | 以上结构在下面一共设置四个,起到阻塞作用 246 | 主线程队空才是真正的队空情况,防止子线程在暂时队空的状态下退出 247 | """ 248 | #----------------------------------------------------------------------------------------------- 249 | while not naviQueue.empty(): 250 | pass 251 | 252 | global navi_EXIT 253 | navi_EXIT = True 254 | print('\rnaviQueue empty!',end='') 255 | 256 | for thread in craw_getLine_Threads: 257 | thread.join() 258 | #------------------------------------ 259 | 260 | while not lineQueue.empty(): 261 | pass 262 | 263 | global line_EXIT 264 | line_EXIT = True 265 | print('\rlineQueue empty!',end='') 266 | 267 | for thread in parse_getLine_Threads: 268 | thread.join() 269 | #------------------------------------ 270 | 271 | while not routeQueue.empty(): 272 | pass 273 | 274 | global route_EXIT 275 | route_EXIT = True 276 | print('\rrouteQueue empty!',end='') 277 | 278 | for thread in craw_getRoute_Threads: 279 | thread.join() 280 | #----------------------------------- 281 | 282 | while not dataQueue.empty(): 283 | pass 284 | 285 | global data_EXIT 286 | data_EXIT = True 287 | print('\rdataQueue empty!',end='') 288 | 289 | for thread in parse_getRoute_Threads: 290 | thread.join() 291 | #----------------------------------- 292 | 293 | 294 | # 将bus_data 存入一个result列表,构造<"result":result>键值对并存入一个新字典 295 | # 将字典转成json格式并存入json文件 296 | shenzhen_busLine = { 297 | 'json_name': '深圳公交线路汇总', 298 | 'updatetime': localtime, 299 | 'results': result 300 | } 301 | file = open('exe_file/13/bus_line.json', 'w', encoding='utf-8') 302 | 303 | """ 304 | json.dump() 305 | 把字典转成json串,并自动写入文件中 306 | dump参数是(字典,文件句柄,indent)。indent用于缩进美化json串的 307 | ensure_ascii=False用于写文件时有unicode时用,正常显示出中文来 308 | """ 309 | json.dump(shenzhen_busLine, file, indent=4, ensure_ascii=False) 310 | file.close() 311 | 312 | if __name__ == '__main__': 313 | # 比单线程的程序快5倍左右 314 | start = timeit.default_timer() 315 | main() 316 | print('\ntime:' + str(timeit.default_timer() - start)) 317 | -------------------------------------------------------------------------------- /13multiThread.py: -------------------------------------------------------------------------------- 1 | """ 2 | 多线程 3 | 面向过程 4 | t = threading.Thread(target=xxx(函数),name=xxx,args=(xx,xx)) 5 | target :线程启动之后要执行的函数 6 | name:线程的名字 7 | 获取线程名字:threading.current_thread().name 8 | args:主线程向子线程传递参数 9 | t.start():启动线程 10 | t.join():让主线程等待子线程结束 11 | 面向对象 12 | 定义一个类,继承自threading.Thread,重写一个方法run(), 13 | 需要线程名字、传递参数,重写构造方法,在重写构造方法的时候,主动调用父类的构造方法 14 | 线程同步问题 15 | 线程之间共享全局变量,很容易发生数据紊乱现象 16 | 使用线程锁解决 17 | 抢锁,谁抢到,谁先上锁,谁就先使用 18 | 创建锁 19 | suo = threading.Lock() 20 | 上锁 21 | suo = acquire() 22 | 释放锁 23 | suo.release() 24 | 25 | 队列(queue) 26 | 下载线程 27 | 解析线程,通过队列进行交互 28 | q = Queue(size) 29 | q.put('xxx')-如果队列满,程序卡在这里等待 30 | q.put(xxx,False)-如果队列满,程序直接报错 31 | q.put(xxx,True,3)-如果队列满,等待三秒再报错 32 | 33 | 获取数据 34 | q.get() 35 | q.get(False) 队空取元素直接报错 36 | q.get(True, 3) 队列空,程序等待3s报错 37 | 38 | q.empty() 判断队列是否满 39 | q.full() 判断队列是否已满 40 | q.qsize() 获取队列长度 41 | """ 42 | import threading 43 | import time 44 | # 一个主线程,一个唱歌,一个跳舞线程 45 | 46 | # TypeError: sing() takes 0 positional arguments but 1 was given 47 | # 需要接收参数a 48 | def sing(a): 49 | print(threading.current_thread().name, a) 50 | for x in range(1, 6): 51 | print('I am sing') 52 | time.sleep(1) 53 | def dance(a): 54 | print(threading.current_thread().name, a) 55 | for x in range(1, 6): 56 | print('I am dancing') 57 | time.sleep(1) 58 | def main(): 59 | a = 'superman' 60 | # 创建唱歌线程 61 | tsing = threading.Thread(target=sing, name="sing", args=(a,)) 62 | # 创建跳舞线程 63 | 
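# 补充:args 需要传入元组(或序列),只有一个参数时要写成 (a,) 并带上逗号;
# 写成 (a) 只是加了括号的变量本身,run 时按 *args 展开会因参数个数不符而报 TypeError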
tdance = threading.Thread(target=dance, name="dance", args=(a,)) 64 | # 启动线程 65 | tsing.start() 66 | tdance.start() 67 | # 让主线程等待子线程结束之后在结束 68 | tsing.join() 69 | tsing.join() 70 | """ 71 | 先让子线程停 72 | 再让主线程停止 73 | """ 74 | print("I am Main") 75 | 76 | if __name__ == '__main__': 77 | main() -------------------------------------------------------------------------------- /1urllib_base.py: -------------------------------------------------------------------------------- 1 | """ 2 | code by python 3.7.2 3 | utf-8 4 | caixiaoxin 5 | index: 1 6 | """ 7 | 8 | 9 | """ 10 | urllib.request: 11 | urlopen:打开url 12 | urlretrieve(url,file_name):打开并保存url内容 13 | urllib.parse: 14 | quote(): url编码函数,将中文进行转化为%xxx 15 | unquote():url解码函数,将%xxx转化为指定字符 16 | urlencode():非法字符转码 17 | response: 18 | read() 读取字节类型 19 | geturl() 获取请求url 20 | getheaders() 21 | getcode() 22 | readlines() 23 | """ 24 | """ 25 | 字符串->二进制:encode() 26 | 二进制->字符串:decode() 27 | 默认utf8 28 | """ 29 | 30 | 31 | 32 | 33 | import urllib.request 34 | url = 'http://www.baidu.com/' 35 | response = urllib.request.urlopen(url=url) 36 | """ 37 | print(response) 38 | print(response.geturl()) 39 | print(response.getheaders()) 40 | print(response.getcode()) 41 | """ 42 | # print(response.read().decode()) 43 | # 读取的url内容存储 44 | with open('baidu.html','w',encoding='utf8') as file: 45 | file.write(response.read().decode()) 46 | """ 47 | 等同上方 48 | 只不过上述用utf8写入 49 | 在此用二进制写入 50 | 图片用这个! 51 | with open('baidu_1.html','wb') as flie: 52 | file.write(response.read()) 53 | """ 54 | 55 | 56 | 57 | 58 | 59 | 60 | # urlretrieve(url,file_name) 61 | picurl = "https://timgsa.baidu.com/timg?image&quality=80&" \ 62 | "size=b9999_10000&sec=1551421909555&di=9f9d69abb9fe596f493f9c6e3e52f08e&imgtype=0&" \ 63 | "src=http%3A%2F%2Fgss0.baidu.com%2F9vo3dSag _xI4khGko9WTAnF6hhy%2Fzhidao%2Fpic%2Fitem%" \ 64 | "2Fb151f8198618367a039b78422c738bd4b31ce51b.jpg" 65 | 66 | """ 67 | # 创建写入文件一条龙服务 68 | # urllib.request.urlretrieve(picurl,'ironMan.jpg') 69 | """ 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | import urllib.parse 78 | 79 | # url中若出现 $ 空格 中文等,就要对其进行编码 80 | url = 'http://www.baidu/index.html?name=钢铁侠&pwd=123456' 81 | ret = urllib.parse.quote(url) 82 | re = urllib.parse.unquote(url) 83 | re_1 = urllib.parse.unquote(picurl) 84 | print(ret) 85 | print(re) 86 | print(re_1) 87 | 88 | """ 89 | urllib.parse.urlencode 的应用! 90 | """ 91 | url = 'http://www.baidu.com/index.html' 92 | # 构造 http://www.baidu.co/index.html?name=goudan&age=18&sex=nv&height=180 93 | name = '钢铁侠' 94 | age = 18 95 | sex = 'nv' 96 | height = "180" 97 | 98 | data = { 99 | 'name' : name, 100 | 'age' : age, 101 | 'sex' : sex, 102 | 'height' : height, 103 | 'weight' : 180, 104 | } 105 | # 具有非法字符的自动转换功能 106 | construct_url = urllib.parse.urlencode(data) 107 | print(construct_url) 108 | construct_url = url + '?' + construct_url 109 | print(construct_url) 110 | 111 | # example:植入搜索关键字 112 | import urllib.parse 113 | baidu = 'http://www.baidu.com/s?' 114 | word = input('input the key you want:') 115 | _data = { 116 | 'ie' : 'utf-8', 117 | 'wd' : word, 118 | } 119 | # 非法字符转码 120 | query_string = urllib.parse.urlencode(_data) 121 | baidu += query_string 122 | response = urllib.request.urlopen(baidu) 123 | filename = word + '.html' 124 | with open(filename,'wb') as file: 125 | file.write(response.read()) 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | # 伪装UA 134 | # 构建请求对象:urllib.request.Request(self,url,data=None,headers={},...) 
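# 示意:除了在构造 Request 时传入 headers,也可以先创建对象再用 add_header 补充请求头,两种写法等价:
#     request = urllib.request.Request(url)
#     request.add_header('User-Agent', 'Mozilla/5.0 ...')
# 下面采用构造时直接传 headers 的写法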
135 | url = 'http://www.baidu.com/' 136 | headers = { 137 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36', 138 | } 139 | request = urllib.request.Request(url = url, headers=headers) 140 | 141 | response = urllib.request.urlopen(request) 142 | -------------------------------------------------------------------------------- /2ajax.py: -------------------------------------------------------------------------------- 1 | 2 | """ 3 | author:caixiaoxin 4 | Ajax 即“Asynchronous Javascript And XML”(异步 JavaScript 和 XML),是指一种创建交互式网页应用的网页开发技术。 5 | Ajax = 异步 JavaScript 和 XML 或者是 HTML(标准通用标记语言的子集)。 6 | Ajax 是一种用于创建快速动态网页的技术。 7 | Ajax 是一种在无需重新加载整个网页的情况下,能够更新部分网页的技术。 8 | 通过在后台与服务器进行少量数据交换,Ajax 可以使网页实现异步更新。这意味着可以在不重新加载整个网页的情况下,对网页的某部分进行更新 9 | """ 10 | 11 | 12 | import urllib.parse 13 | import urllib.request 14 | 15 | 16 | 17 | 18 | 19 | """ 20 | 豆瓣爬取 21 | """ 22 | #如果url链接中出现我们要调配的参数并赋值常数,需要删除相关参数部分 23 | url = "https://movie.douban.com/j/chart/top_list?type=5&interval_id=100%3A90&action=&" 24 | page = 1 25 | limit = 1 26 | # 构建post表单 27 | data = { 28 | 'start':(page-1)*limit, 29 | 'limit':limit, 30 | } 31 | headers = { 32 | 'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36", 33 | } 34 | 35 | """ 36 | request = urllib.request.Request(url=url,headers=headers) 37 | data = urllib.parse.urlencode(data).encode() 38 | response = urllib.request.urlopen(request,data = data) 39 | """ 40 | 41 | # 等价于三部曲 42 | query_string = urllib.parse.urlencode(data) 43 | url += query_string 44 | 45 | request = urllib.request.Request(url = url,headers=headers) 46 | response = urllib.request.urlopen(request) 47 | 48 | print(response.read().decode()) 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | """ 65 | kfc爬取 66 | """ 67 | post_url = 'http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=cname' 68 | form_data = { 69 | 'cname': '深圳', 70 | 'pid': '', 71 | 'pageIndex': '2', 72 | 'pageSize': '10', 73 | } 74 | 75 | request = urllib.request.Request(url = post_url,headers=headers) 76 | form_data = urllib.parse.urlencode(form_data).encode() 77 | response = urllib.request.urlopen(request,data = form_data) 78 | print(response.read().decode()) 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | """ 95 | 贴吧爬取 96 | """ 97 | 98 | # tieba_url = 'http://tieba.baidu.com/f?ie=utf-8&kw=python&red_tag=d3356873073' 99 | tieba_url = 'http://tieba.baidu.com/f?ie=utf-8' 100 | # 页码变化参数 pn 101 | # pn按50递增 102 | 103 | 104 | for page in range(0,10): 105 | form_data = { 106 | 'kw': 'python', 107 | 'pn': (page-1)*50 108 | } 109 | form_data = urllib.parse.urlencode(form_data).encode() 110 | request = urllib.request.Request(url = tieba_url,headers = headers) 111 | response = urllib.request.urlopen(request,data = form_data) 112 | 113 | with open('tiebaPage//'+str(page)+'.html','wb') as file: 114 | file.write(response.read()) 115 | 116 | 117 | 118 | 119 | 120 | """ 121 | 异常处理 122 | """ 123 | import urllib.error 124 | url = 'http://www.maodan.com/' 125 | 126 | #URLerror 127 | try: 128 | response = urllib.request.urlopen(url) 129 | except urllib.error.URLError as e: 130 | print(e) ## 131 | 132 | #HTTPerror 133 | 134 | -------------------------------------------------------------------------------- /4handler.py: -------------------------------------------------------------------------------- 1 | 2 | """ 3 | Handler处理器,自定义Opener 4 | urlopen() 
给一个url,发送请求,获取响应 5 | Request() 定制请求头,创建请求对象 6 | 高级功能:使用代理,cookie 7 | 8 | 代理: 9 | 配置:浏览器配置:高级-代理设置 10 | 代码配置:就此可以在xici上爬取ip代理,然后随机分配给爬虫,来突破爬取网站所给的爬取频率 11 | 同时也防止自身ip被封 12 | cookie 13 | #服务器端访问网站所留下的识别信息# 14 | 模拟登陆:抓包获取cookie 15 | 通过cookieJar保存模拟登陆所得到的cookie 16 | """ 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | """利用handler和opener获取页面的基本操作""" 25 | import urllib.request 26 | import urllib.parse 27 | 28 | url = 'http://baidu.com/' 29 | 30 | headers = { 31 | 'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36", 32 | } 33 | 34 | # 创建 35 | handler = urllib.request.HTTPHandler() 36 | 37 | # 通过hander创建一个opener,使用opener中的方法发送请求 38 | opener = urllib.request.build_opener(handler) 39 | 40 | # 构建请求对象 41 | request = urllib.request.Request(url,headers=headers) 42 | 43 | # 发送请求 44 | response = opener.open(request) 45 | # print(response.read().decode()) 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | """ 57 | 利用ProxyHandler进行ip伪装 58 | ip伪装会出现问题,留坑 59 | 在西刺网获取ip进行实践成功率非常低 60 | 响应时间过长及ip伪装失败(成功响应但是ip仍为本机) 61 | """ 62 | 63 | url = 'https://baidu.com/s?' 64 | headers = { 65 | 'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36", 66 | } 67 | data = { 68 | 'ie' : 'utf-8', 69 | 'wd' : 'ip', 70 | } 71 | # 构造表单请求链接 72 | query_string = urllib.parse.urlencode(data) 73 | url += query_string 74 | 75 | # 伪装ip格式 76 | handler = urllib.request.ProxyHandler({'http':'121.232.148.73:9000'}) 77 | opener = urllib.request.build_opener(handler) 78 | 79 | request = urllib.request.Request(url,headers=headers) 80 | # 很神奇,三部曲失效,所以只能构造url了 81 | # data = urllib.parse.urlencode(data).encode() 82 | response = opener.open(request) 83 | 84 | with open("ip1.html",'wb') as file: 85 | file.write(response.read()) 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | """ 94 | 利用fiddler抓取cookie实现人人网主页的获取 95 | 并没有抓取登陆时的json因为文件特别特别多 96 | 抓取的json来源于进入主页所发送的表单请求 97 | 98 | 问题:cookie是实时的,所以该方法捕获的cookie需要实时抓包,过期失效 99 | 100 | 该问题引出下一个实例,登陆人人网并进入个人主页 101 | """ 102 | 103 | renren_url = 'http://www.renren.com/969920379/profile' 104 | 105 | headers = { 106 | 'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36", 107 | 'Cookie': 'anonymid=jsrj643rtki53d; depovince=GUZ; _r01_=1; ln_uact=15625266605; ' 108 | 'ln_hurl=http://head.xiaonei.com/photos/0/0/men_main.gif; ick_login=c8e7f87f-75da-4f2f-b840-104909404637; ' 109 | 'first_login_flag=1; JSESSIONID=abcJbn-wkjg1eh_WTZiLw; jebecookies=7817aaaf-eb35-4f05-b071-f3e5f75f07c2|||||;' 110 | ' _de=E00E5A467C4B17304268C536701AF72D; p=c14a2da80879c2daa73cc1a3853720609; t=1e2cbc27606389927d404e61f48774c19; ' 111 | 'societyguester=1e2cbc27606389927d404e61f48774c19; id=969920379; xnsid=b72aafa1; ver=7.0; loginfrom=null; ' 112 | 'wp_fold=0log=[{"hostId":"969920379","targetTag":"name_click","sendUserId":"969920379"}]&requestToken=-1644252112&_rtk=69bfb28a', 113 | } 114 | 115 | request = urllib.request.Request(url = renren_url,headers = headers) 116 | response = urllib.request.urlopen(request) 117 | 118 | with open('renren.html','wb') as file: 119 | file.write(response.read()) 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | """ 133 | http.cookiejar的应用,保存cookie,通过保存的cookie访问主页 134 | 密码是加密过的,所以并不能构造表单实现登陆 135 | 表单的获取需要手动登陆得到post请求 136 | 登陆后,由于cookie提前保存,所以能够登陆该账号的其他页面 137 | """ 138 | 139 | 140 | import http.cookiejar 141 | #模拟真实浏览器,发送完post请求猴,将cookie保存到代码中 142 | 143 | # 
创建一个cookie对象 144 | cj = http.cookiejar.CookieJar() 145 | # 通过cookie创建一个handler 146 | handler = urllib.request.HTTPCookieProcessor(cj) 147 | # 根据handler创建一个opener 148 | opener = urllib.request.build_opener(handler) 149 | 150 | url = 'http://www.renren.com/ajaxLogin/login?1=1&uniqueTimestamp=2019212154558' 151 | formdata = { 152 | 'email':'15625266605', 153 | 'icode' :'', 154 | 'origURL':'http://www.renren.com/home', 155 | 'domain':'renren.com', 156 | 'key_id':'1', 157 | 'captcha_type': 'web_login', 158 | 'password': '1162c49a98a09a374364c99e2ad203b82211bc9cfdf8411e3b47d3ae268ec869', 159 | 'rkey': '54fa0fe478cb62a6ae1184e8e15c9dbb', 160 | 'f':'http%3A%2F%2Fwww.renren.com%2F969920379', 161 | } 162 | headers = { 163 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) ' 164 | 'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36' 165 | } 166 | 167 | request = urllib.request.Request(url = url,headers = headers) 168 | formdata = urllib.parse.urlencode(formdata).encode() 169 | response = opener.open(request,data = formdata) 170 | 171 | 172 | 173 | get_url = 'http://www.renren.com/969920379/profile' 174 | 175 | # 进入自己的主页 176 | request = urllib.request.Request(url=get_url,headers = headers) 177 | response = opener.open(request) 178 | 179 | with open('renren.html','wb') as file: 180 | file.write(response.read()) 181 | 182 | 183 | 184 | -------------------------------------------------------------------------------- /5.1正则爬取糗.py: -------------------------------------------------------------------------------- 1 | """ 2 | 5.1 正则表达式-爬取糗图 3 | code utf8 4 | date 03/08/2019 5 | author caixiaoxin 6 | """ 7 | 8 | import urllib.parse 9 | import urllib.request 10 | import re 11 | def download_image(content,file): 12 | # pattern = re.compile(r'
.*?(.*?).*?
',re.S) 13 | """ 14 | re.S 可以使 . 具有匹配换行的功能 15 | 16 | 正则中加入括号,表示匹配的目标字段,也就是想要获取的信息 17 | 18 | pattern 为得到的图片的url 19 | _pattern 为得到图片相应的段子 20 | """ 21 | pattern = re.compile(r'
.*?',re.S) 22 | _pattern = re.compile(r'div class="content".*?(.*?).*?
',re.S) 23 | 24 | """ 25 | findall:返回匹配目标字段的列表 26 | """ 27 | image_urls = pattern.findall(content) 28 | image_titles = _pattern.findall(content) 29 | 30 | 31 | # 将爬取得到的图片url及段子写入文件 32 | for index in range(len(image_urls)): 33 | image_urls[index] = 'http:' + image_urls[index] 34 | try: 35 | file.writelines(image_urls[index] + ':\n' + 36 | image_titles[index] + '\n\n') 37 | except: 38 | pass 39 | 40 | def main(): 41 | url = 'https://www.qiushibaike.com/pic/page/{}/' 42 | headers = { 43 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36', 44 | } 45 | file = open('qiutu.html', 'w') 46 | 47 | start_page = 1 48 | end_page = 5 49 | for page in range(start_page,end_page+1): 50 | request = urllib.request.Request(url = url.format(page),headers = headers) 51 | content = urllib.request.urlopen(request).read().decode() 52 | download_image(content,file) 53 | 54 | if __name__ == '__main__': 55 | main() 56 | -------------------------------------------------------------------------------- /5.2正则爬取励志网并建立文章集合页面.py: -------------------------------------------------------------------------------- 1 | """ 2 | 5.2 正则表达式-爬取励志网 3 | code utf8 4 | date 03/08/2019 5 | author caixiaoxin 6 | """ 7 | 8 | 9 | import urllib.parse 10 | import urllib.request 11 | import re 12 | import os 13 | 14 | 15 | headers = { 16 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36', 17 | } 18 | # html文件头-不加会乱码 19 | html_head = """ 20 | 21 | 22 | 5.2正则:爬取励志网 23 | 24 | """ 25 | 26 | file_address = 'lizhi.html' 27 | 28 | # 文件存在则先删除原有文件 29 | if os.path.exists(file_address): 30 | os.remove(file_address) 31 | else: 32 | os.mkdir(file_address) 33 | 34 | # 写入文件头 35 | file = open(file_address,'a',encoding='utf8') 36 | file.write(html_head) 37 | 38 | 39 | # 获取url请求 40 | def get_request(url): 41 | request = urllib.request.Request(url = url,headers = headers) 42 | return request 43 | 44 | # 提取每篇文章的标题和链接 45 | def parse_content(content): 46 | """ 47 | # 努力,奋斗,坚持,不抛弃,不放弃,一切皆有可能 48 | #

我不知道年少轻狂,我只知道胜者为王——追梦赤子心

49 | 查出标签有两个版本,一个带b标签,一个没有 50 | 决定暂时保留b标签,过后单独处理 51 | """ 52 | 53 | pattern = re.compile(r'

(.*?)

') 54 | 55 | articleList = pattern.findall(content) 56 | # print(len(articleList)) 57 | for article in articleList: 58 | # 可能出现带b标签的标题,清除 59 | # article[1].replace('','').replace('','') 60 | get_text(url = 'http://www.yikexun.cn'+article[0],title = article[1].replace('','').replace('','')) 61 | 62 | 63 | # 提取文章内容 64 | def get_text(url,title): 65 | request = urllib.request.Request(url = url,headers = headers) 66 | content = urllib.request.urlopen(request).read().decode() 67 | 68 | # 提取文章内容 69 | pattern = re.compile(r'
(.*?)
',re.S) 70 | article = pattern.findall(content)[0].strip() 71 | 72 | 73 | """ 74 | bug: 75 | 写入html后打开,会出现文章渐进的排版错误 76 | 77 | 源:因为有些文章结尾不明多出<p>,缺失结束标签 78 | 79 | 修复:去除内容结尾空格,检查尾缀是否为<p>,将<p>替换成结束标签 80 | """ 81 | title = title.strip() 82 | if article[-3:] == "<p>": 83 | article = article[:-3] + "</p>" 84 | 85 | 86 | 87 | 
# 美化:去除所有无法加载(其实也就是全部)的图片 88 | # v2-fbdde028d48d572b2425965acf058add_hd.png 89 | image_pattern = re.compile(r'<img.*?>') 90 | 91 | """ 92 | 这个替换挺简便的 93 | """ 94 | article = image_pattern.sub('',article) 95 | parse_html(title = title,article = article) 96 | 97 | 98 | 99 | 
# 文章写入html文件 100 | def parse_html(title,article): 101 | 102 | #标题加上h1标签,设置每篇文章排版 103 | complete_arc = '

<h1>%s</h1>

%s\n\n'%(title,article) 104 | 105 | file.write(complete_arc) 106 | 107 | 108 | 109 | def main(): 110 | url = 'http://www.yikexun.cn/lizhi/qianming/list_50_{}.html' 111 | start_page = 1 112 | end_page = 10 113 | 114 | for page in range(start_page,end_page+1): 115 | request = get_request(url.format(page)) 116 | 117 | #预览页(主页内容) 118 | content = urllib.request.urlopen(request).read().decode() 119 | parse_content(content) 120 | 121 | 122 | 123 | 124 | 125 | 126 | if __name__ == '__main__': 127 | main() 128 | 129 | -------------------------------------------------------------------------------- /58crawler/58.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/58crawler/58.ttf -------------------------------------------------------------------------------- /58crawler/58decode.py: -------------------------------------------------------------------------------- 1 | import re 2 | import lxml.html 3 | import base64 4 | from fontTools.ttLib import TTFont 5 | import requests 6 | import random 7 | import sqlite3 8 | import time 9 | 10 | db = sqlite3.connect("58.db") 11 | cursor = db.cursor() 12 | 13 | UA = [ 14 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50", 15 | "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50", 16 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1", 17 | "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1" 18 | ] 19 | # https://sz.58.com/shixia/chuzu/pn68/?PGTID=0d3090a7-017c-74d8-4e30-b085460b77a1&ClickID=2 20 | # https://sz.58.com/shixia/chuzu/pn66/?PGTID=0d3090a7-017c-74e7-0457-6697c22f0410&ClickID=2 21 | headers = { 22 | "User-Agent": random.choice(UA) 23 | } 24 | 25 | 26 | def resp(i): 27 | 28 | base_url = "https://sz.58.com/chuzu/pn{}/?key=大望&PGTID=0d3090a7-017c-74d8-4e30-b085460b77a1&ClickID=2" 29 | response = requests.get(base_url.format(i), headers=headers) 30 | print("正在下载:", response.url) 31 | return response 32 | 33 | 34 | def get_base64_str(response): 35 | base_font = re.compile("base64,(.*?)\'") 36 | base64_str = re.search(base_font, response.text).group().split(',')[1].split('\'')[0] 37 | return base64_str 38 | 39 | 40 | def make_font_file(base64_str): 41 | b = base64.b64decode(base64_str) 42 | with open("58.ttf", "wb") as f: 43 | f.write(b) 44 | 45 | 46 | def make_dict(): 47 | font = TTFont('58.ttf') 48 | b = font['cmap'].tables[2].ttFont.getReverseGlyphMap() # 编码对应的数字 49 | c = font['cmap'].tables[2].ttFont.tables['cmap'].tables[1].cmap # 页面的十六进制数对应的编码 50 | return b, c 51 | 52 | 53 | def parse_title(text): 54 | s = "" 55 | title_re = re.compile("\s") 56 | html = lxml.html.fromstring(text) 57 | title = html.xpath('//a[@class="strongbox"]//text()')[0] 58 | title = re.sub(title_re, '', title) 59 | for i in title: 60 | encode_str = str(i.encode("unicode-escape")).split(r'\\u')[-1].replace('\'', '').replace(r'b(', '').strip() 61 | try: 62 | num, code = make_dict() 63 | if len(encode_str) != 4: 64 | i = i 65 | elif int(encode_str, 16) not in code: 66 | i = i 67 | else: 68 | i = str(num[code[int(encode_str, 16)]] - 1) 69 | s += i 70 | except: 71 | s = "None" 72 | return s 73 | 74 | 75 | def parse_price(text): 76 | s = "" 77 | html = lxml.html.fromstring(text) 78 | price_code = html.xpath('//div[@class="money"]/b/text()')[0] 79 | price_code = 
price_code.strip().replace('\r\n', '').replace(' ', '') 80 | price_encode_str = str(price_code.encode("unicode-escape")).split('\'')[1].split('-') 81 | if len(price_encode_str) > 1: 82 | s1 = "" 83 | s2 = "" 84 | encode_list1 = price_encode_str[0].split(r"\\u")[1:] 85 | encode_list2 = price_encode_str[1].split(r"\\u")[1:] 86 | for i in encode_list1: 87 | price = int(i, 16) 88 | num, code = make_dict() 89 | s1 += str(num[code[price]] - 1) 90 | for i in encode_list2: 91 | price = int(i, 16) 92 | num, code = make_dict() 93 | s2 += str(num[code[price]] - 1) 94 | s = s1 + '-' + s2 95 | 96 | else: 97 | str_list = price_encode_str[0].split(r'\\u')[1:] 98 | for i in str_list: 99 | price = int(i, 16) 100 | try: 101 | num, code = make_dict() 102 | s += str(num[code[price]] - 1) 103 | except: 104 | s = "None" 105 | 106 | return s 107 | 108 | 109 | def parse_room(text): 110 | s = "" 111 | html = lxml.html.fromstring(text) 112 | p_rooms = html.xpath('//p[@class="room strongbox"]/text()')[0] 113 | room_re = re.compile('[\s]') 114 | room_re1 = re.compile(r'[m²]') 115 | room_re2 = re.compile(r'/') 116 | rooms = re.sub(room_re, '', p_rooms) 117 | rooms = re.sub(room_re1, "平米", rooms) 118 | rooms = re.sub(room_re2, "至", rooms) 119 | for i in rooms: 120 | # print(i.encode("unicode-escape")) 121 | encode_str = str(i.encode("unicode-escape")).split(r'\\u')[-1].replace('\'', '').replace(r'b/', '').strip() 122 | # print(encode_str) 123 | try: 124 | num, code = make_dict() 125 | if len(encode_str) != 4: 126 | i = i 127 | elif int(encode_str, 16) not in code: 128 | i = i 129 | else: 130 | i = str(num[code[int(encode_str, 16)]] - 1) 131 | s += i 132 | except: 133 | s = "None" 134 | return s 135 | 136 | #debug 137 | def parse_dist(text): 138 | s = "" 139 | html = lxml.html.fromstring(text) 140 | p_dist_re = re.compile('\skm') 141 | try: 142 | p_dist = html.xpath('//p[@class="add"]/text()')[3] 143 | p_dist = ''.join(p_dist).replace(' ', '') 144 | p_dist = re.sub(p_dist_re, '千米', p_dist) 145 | for i in p_dist: 146 | encode_str = str(i.encode("unicode-escape")).split(r'\\u')[-1].replace('\'', '').replace(r'\\r', 147 | '').replace(r'\\n', 148 | '').replace( 149 | r'b.', '').strip() 150 | num, code = make_dict() 151 | if len(encode_str) != 4: 152 | i = i 153 | elif int(encode_str, 16) not in code: 154 | i = i 155 | else: 156 | i = str(num[code[int(encode_str, 16)]] - 1) 157 | s += i 158 | dist = s 159 | except: 160 | dist = "暂无" 161 | return dist 162 | 163 | 164 | def short_rent(text): 165 | html = lxml.html.fromstring(text) 166 | try: 167 | rent = html.xpath('//p[@class="room"]/b/text()')[0] 168 | except: 169 | rent = "不可短租" 170 | return rent 171 | 172 | 173 | def parse_li(response): 174 | li_re = re.compile('
  • ') 175 | li_list = re.findall(li_re, response.text) 176 | return li_list 177 | 178 | 179 | def parse_target(text): 180 | html = lxml.html.fromstring(text) 181 | try: 182 | target = html.xpath('//p[@class="spec"]/span/text()') 183 | target = ','.join(target) 184 | except: 185 | target = "暂无" 186 | return target 187 | 188 | 189 | if __name__ == '__main__': 190 | for i in range(1, 71): 191 | response = resp(i) 192 | time.sleep(5) 193 | base64_str = get_base64_str(response) 194 | make_font_file(base64_str) 195 | make_dict() 196 | li_list = parse_li(response) 197 | for i in li_list: 198 | # print(i) 199 | title = parse_title(i) 200 | price = parse_price(i) 201 | room = parse_room(i) 202 | dist = parse_dist(i) 203 | rent = short_rent(i) 204 | target = parse_target(i) 205 | city = "深圳" 206 | print(title,price,room,dist,rent,target) 207 | # cursor.execute("insert into home(title, price, room, dist, rent,target, city) values (?,?,?,?,?,?,?)", 208 | # [title, price, room, dist, rent, target, city]) 209 | db.commit() 210 | -------------------------------------------------------------------------------- /6.1read_list.py: -------------------------------------------------------------------------------- 1 | 2 | file = open('exe_file/hello.txt','r',encoding='utf8') 3 | string = file.read() 4 | file.close() 5 | 6 | lt = eval(string) 7 | print(lt[0]['name']) 8 | 9 | # out: 10 | # 宫本武藏 11 | # 小田纯一郎 -------------------------------------------------------------------------------- /6xpath.py: -------------------------------------------------------------------------------- 1 | """ 2 | TODO:xpath学习 3 | TEST:段子网爬取 4 | Author:caixiaoxin 5 | Date:2019/7/10 6 | """ 7 | """ 8 | xpath? 9 | xml是用来存储和传输数据的 10 | 和html的不同点: 11 | 1 html是用来显示数据的,xml是用来传输的 12 | 2 html是固定的,xml标签是自定义的 13 | 14 | 15 | Harry Potter 16 | K.Rowling 17 | 2005 18 | 29.99 19 | 20 | 21 | 22 | 文档节点 23 | K.Rowling 元素节点 24 | lang="en" 属性节点 25 | 26 | // 不考虑位置的查找 27 | ./ 从当前节点开始往下查找 28 | .. 
从当前节点的父结点查找 29 | @ 选取属性 30 | 31 | e: 32 | /bookstore/book 选取根节点bookstore下面所有直接子节点的book 33 | //book 选取所有的book 34 | bookstore//book 查找bookstore 下面所有的book,不管所在位置 35 | /bookstore/book[1] bookstore 里面的第一个book 36 | /bookstore/book[last()] bookstore里面的最后一个book 37 | /bookstore/book[position()<3] 前两个book 38 | //title[@lang] 所有带有lang属性的title 39 | //title[@lang='eng'] 所有的lang属性为eng的title节点 40 | * 任何元素节点 41 | 42 | 43 | 属性定位 44 | //input[@id="kw"] 45 | //input[@class="g s_ btn"] 46 | 层级定位 47 | //div[@id="head"]/div/div[2]/a[@class="toindex"] --索引从1开始 48 | //div[@id="head"]//a[@class="toindex"] --双斜杠表示下面的所有a节点,不管位置 49 | 逻辑运算 50 | //input[@class="s_ipt" and @name="wd] 51 | 模糊匹配 52 | contains://input[contains(@class,"s_i")]---所有input,有class属性,并且属性中带s_i节点 53 | //input[contains(text(),"爱")] 54 | starts-with://input[starts-with(@class,"s")]---所有的input,有class属性,并且属性以s开头 55 | 取文本 56 | //div[@id="ul"]/a[5]/text() 57 | 所有文本 58 | //div[@id="n1"]//text() div下所有的文本 59 | 60 | 取属性 61 | //div[@id="ul"]/a[5]/@href 62 | 63 | 代码中应用xpath 64 | from lxml import etree 65 | 将html文档变成一个对象,然后调用对象的方法去查找指定的节点 66 | 1 本地文件 67 | tree=etree.parse 68 | 2 网络文件 69 | tree=etree.HTML(网页字符串) 70 | 71 | """ 72 | 73 | # xpath测试 74 | 75 | from lxml import etree 76 | # 使用lxml.etree.parse()解析html文件,该方法默认使用的是“XML”解析器,所以如果碰到不规范的html文件时就会解析错误 77 | # lxml.etree.XMLSyntaxError: Opening and ending tag mismatch: meta line 3 and head, line 3, column 87 78 | # 创建html解析器,增加parser参数 79 | parser = etree.HTMLParser(encoding="utf-8") 80 | tree = etree.parse('exe_file/xpath.html', parser=parser) 81 | # print(tree) 82 | 83 | ret = tree.xpath('//div[@class="tang"]/ul/li[1]/text()') #取文本 84 | print(ret) #out:['\r\n 停车坐爱枫林晚,霜叶红于二月花\r\n '] 85 | 86 | ret1 = tree.xpath('//div[@class="tang"]/ul/li[last()]/a/@href') #取属性 87 | print(ret1) #out:['http://www.baidu.com/'] 88 | 89 | ret2 = tree.xpath('//div[@class="tang"]/ul/li[@class="love"]') #层次定位 90 | print(ret2) #out:[, ] 91 | 92 | ret3 = tree.xpath('//div[@class="tang"]/ul/li[@class="love" and @name="yang"]') #逻辑定位 93 | print(ret3) #out:[] 94 | 95 | ret4 = tree.xpath('//li[contains(@class,"l")]') #模糊搜索 96 | print(ret4) #out:[, , , , ] 97 | 98 | ret5 = tree.xpath('//li[contains(text(),"爱")]/text()') #模糊文本搜索 99 | print(ret5) #['\r\n 停车坐爱枫林晚,霜叶红于二月花\r\n ', '爱就一个字,我只说一次', '爱情36计,我要立刻美丽'] 100 | 101 | 102 | ret6 = tree.xpath('//li[starts-with(@class,"li")]/text()') #模糊匹配 103 | print(ret6) #['\r\n 乍暖还寒时候,最难将息\r\n ', '\r\n 三杯两盏淡酒\r\n '] 104 | 105 | ret7 = tree.xpath('//div[@class="song"]//text()') 106 | print(ret7) # ['\r\n 火药\r\n ', '指南针', '\r\n ', '印刷术', '\r\n 造纸术\r\n '] 107 | 108 | 109 | # 不建议采用,因为编码原因难以转换 110 | ret8 = tree.xpath('//div[@class="song"]') #提取拼接文本 111 | str = ret8[0].xpath('string(.)') 112 | print(str) 113 | # 火药 114 | # 指南针 115 | # 印刷术 116 | # 造纸术 117 | 118 | 119 | 120 | """ 121 | 爬取段子网 122 | """ 123 | import urllib.request 124 | import urllib.parse 125 | from lxml import etree 126 | 127 | # 构造url,返回请求内容 128 | def handle_request(url,page): 129 | # 构造头部信息 130 | headers = { 131 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3', 132 | 'Accept-Encoding': 'gzip, deflate', 133 | 'Accept-Language': 'zh-CN,zh;q=0.9', 134 | 'Cache-Control': 'max-age=0', 135 | 'Connection': 'keep-alive', 136 | 'Host': 'duanziwang.com', 137 | 'Upgrade-Insecure-Requests': 1, 138 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36' 139 | } 140 | # 构造相应页面的url 141 | url = 
url.format(page) 142 | print(url) 143 | request = urllib.request.Request(url, headers=headers) 144 | return request 145 | 146 | # html内容解析 147 | def parse_content(content): 148 | 149 | # 构造对象 150 | tree = etree.HTML(content) 151 | # 筛选本页面的文章概要列表 152 | article_list = tree.xpath('//article[@id and @class="post"]') 153 | # print(len(article_list)) 154 | 155 | # 概要中提取信息 156 | for article in article_list: 157 | title = article.xpath('.//div[@class="post-head"]/h1/a/text()') [0] #获取标题 158 | # print(title) 159 | text = article.xpath('.//div[@class="post-content"]//text()') #获取文本 160 | content_text = '' 161 | for word in text: 162 | word = word.strip() 163 | content_text += word.replace('\n','').replace('\r','') 164 | # 空文本进行信息补充 165 | if len(content_text) == 0: 166 | content_text = "这个标题有点长" 167 | 168 | # 提取时间 169 | time = article.xpath('.//div[@class="post-meta"]/time[@class="post-date" and @datetime]/text()')[0] 170 | # print(time) 171 | 172 | # 提取点赞数 173 | like_num = article.xpath('.//div[@class="post-meta"]/time[@class="post-date"]/a/span/text()')[0] 174 | # print(like_num) 175 | 176 | print("title:" + title) 177 | print("time:" + time) 178 | print("like:" + like_num) 179 | print("text:" + content_text) 180 | print("------------------------------") 181 | 182 | 183 | def main(): 184 | # start_page = int(input('begin:')) 185 | # end_page = int(input('end:')) 186 | 187 | start_page = 1 188 | end_page = 100 189 | 190 | url = 'http://duanziwang.com/page/{}/' 191 | for page in range(start_page,end_page+1): 192 | request = handle_request(url, page) 193 | content = urllib.request.urlopen(request).read().decode() 194 | # print(content) 195 | 196 | parse_content(content) 197 | 198 | if __name__ == '__main__': 199 | main() 200 | 201 | -------------------------------------------------------------------------------- /7pictureLoad.py: -------------------------------------------------------------------------------- 1 | # http://sc.chinaz.com/tupian/xingganmeinvtupian.html 2 | """ 3 | 懒加载:只显示可视区的图片 4 | 实现方式 5 | 6 | 监视 -> 7 | 特点:找不到src 8 | """ 9 | import urllib.request 10 | import urllib.parse 11 | from lxml import etree 12 | import os 13 | 14 | # 构造请求 15 | def handle_request(url, page): 16 | if page == 1: 17 | url = url.format('') 18 | else: 19 | url = url.format('_' + str(page)) 20 | #print(url) 21 | # 构造头部信息 22 | headers = { 23 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3', 24 | 'Accept-Language': 'zh-CN,zh;q=0.9', 25 | 'Cache-Control': 'max-age=0', 26 | 'Connection': 'keep-alive', 27 | 'Upgrade-Insecure-Requests': 1, 28 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36' 29 | } 30 | request = urllib.request.Request(url=url,headers=headers) 31 | return request 32 | 33 | # 页面信息解析提取 34 | def parse_content(content): 35 | tree = etree.HTML(content) 36 | # //div[@id="container"]/div/div/a/img/@src 37 | image_list = tree.xpath('//div[@id="container"]/div/div/a/img/@src2') 38 | # 懒加载 39 | # print(image_list) 40 | for image_url in image_list: 41 | download_image(image_url) 42 | 43 | # 下载图片 44 | def download_image(image_url): 45 | # 下载目录 46 | dirpath = 'exe_file/xinggan' 47 | # 不存在目录即创建 48 | if not os.path.exists(dirpath): 49 | os.mkdir(dirpath) 50 | # 生成文件名 51 | filename = os.path.basename(image_url) 52 | # 加入文件 53 | filepath = os.path.join(dirpath, filename) 54 | 55 | # 构造头部信息 56 | headers = { 57 | 'Accept': 
'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3', 58 | 'Accept-Language': 'zh-CN,zh;q=0.9', 59 | 'Cache-Control': 'max-age=0', 60 | 'Connection': 'keep-alive', 61 | 'Upgrade-Insecure-Requests': 1, 62 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36' 63 | } 64 | # 保存图片 65 | request = urllib.request.Request(url=image_url, headers=headers) 66 | response = urllib.request.urlopen(request) 67 | with open(filepath, 'wb') as fp: 68 | fp.write(response.read()) 69 | 70 | 71 | def main(): 72 | url = 'http://sc.chinaz.com/tupian/xingganmeinvtupian{}.html' 73 | start_page = 1 74 | end_page = 2 75 | for page in range(start_page,end_page+1): 76 | request = handle_request(url, page) 77 | # UnicodeDecodeError: 'utf-8' codec can't decode byte 0x8b in position 1: invalid start by 78 | # slove:headers有一句'Accept-Encoding': 'gzip, deflate',删掉就好了 79 | content = urllib.request.urlopen(request).read().decode() 80 | parse_content(content) 81 | 82 | 83 | if __name__ == '__main__': 84 | main() -------------------------------------------------------------------------------- /8jsonpath.py: -------------------------------------------------------------------------------- 1 | """ 2 | jsonpath--用来解析json数据 3 | python处理json用到的函数 4 | import json 5 | json.dumps()--将字典获列表转化为json的字符串 6 | json.loads()--将json转化为python对象 7 | json.dump()---将字典/列表转化为json格式字符串并写入文件中 8 | json.load()---从文件中读取json格式字符串,转化为python对象 9 | 前端处理 10 | 将json格式字符串转化为js对象 11 | JSON.parse('json格式字符串') 12 | eval('(' + json格式字符串 + ')') 13 | 安装: 14 | pip install jsonpath 15 | https://blog.csdn.net/luxideyao/article/details/77802389 16 | 17 | 与xpath的区别 18 | / $ 表示根元素 19 | . @ 当前元素 20 | / . or [] 子元素 21 | .. n/a 父元素 22 | // .. 
递归下降,JSONPath是从E4X借鉴的。 23 | * * 通配符,表示所有的元素 24 | xpath下标从1开始,jsonpath从0开始 25 | --------------------------------------- 26 | @ n/a 属性访问字符 27 | [] [] 子元素操作符 28 | | [,] 连接操作符在XPath 结果合并其它结点集合。JSONP允许name或者数组索引。 29 | n/a [start:end:step] 数组分割操作从ES4借鉴。 30 | [] ?() 应用过滤表示式 31 | n/a () 脚本表达式,使用在脚本引擎下面。 32 | () n/a Xpath分组 33 | """ 34 | 35 | 36 | import json 37 | 38 | lt = [ 39 | {'name': '王宝强', 'age': 30}, 40 | {'name': 'pgone', 'age': 30}, 41 | {'name': '马蓉', 'age': 30}, 42 | {'name': '宋吉', 'age': 30}, 43 | {'name': '李小璐', 'age': 30}, 44 | ] 45 | # 将字典获列表转化为json的字符串 46 | string = json.dumps(lt) 47 | print(string) 48 | # out:[{"name": "\u738b\u5b9d\u5f3a", "age": 30}, 49 | # {"name": "pgone", "age": 30}, {"name": "\u9a6c\u84c9", "age": 30}, 50 | # {"name": "\u5b8b\u5409", "age": 30}, {"name": "\u674e\u5c0f\u7490", "age": 30}] 51 | 52 | import jsonpath 53 | 54 | # 将json格式文件转成python对象 55 | obj = json.load(open('exe_file/book.json','r',encoding='utf-8')) 56 | print(obj) 57 | 58 | # 书单所有书的作者 59 | ret = jsonpath.jsonpath(obj,'$.store.book[*].author') 60 | print(ret) 61 | # solve2 62 | ret1 = jsonpath.jsonpath(obj,'$..author') 63 | print(ret1) 64 | 65 | # 查找store下面所有的节点 66 | ret2 = jsonpath.jsonpath(obj,'$.store.*') 67 | print(ret2) 68 | 69 | # 查找store下面所有的price 70 | ret3 = jsonpath.jsonpath(obj,'$.store..price') 71 | print(ret3) 72 | 73 | # 查找第三个book 74 | ret4 = jsonpath.jsonpath(obj,'$..book[2]') 75 | print(ret4) 76 | 77 | # 查找最后一个book 78 | ret5 = jsonpath.jsonpath(obj,'$..book[(@.length-1)]') 79 | print(ret5) 80 | 81 | # 查找前两本书 82 | ret6 = jsonpath.jsonpath(obj,'$..book[0,1]') 83 | # ret6 = jsonpath.jsonpath(obj,'$..book[:2]') 84 | # ret6 = jsonpath.jsonpath(obj,'$..book')[:2] 85 | print(ret6) 86 | 87 | # 查找含有isbn键的book 88 | ret7 = jsonpath.jsonpath(obj,'$..book[?(@.isbn)]') 89 | print(ret7) 90 | 91 | #查找所有price键对应的值小于10的所有book 92 | ret8 = jsonpath.jsonpath(obj,'$..book[?(@.price<10)]') 93 | print(ret8) 94 | 95 | 96 | 97 | import urllib.request 98 | import urllib.response 99 | import jsonpath 100 | import csv 101 | """ 102 | https://fe-api.zhaopin.com/c/i/sou?start=180&pageSize=90& 103 | cityId=765&workExperience=-1&education=-1&companyType=-1& 104 | employmentType=-1&jobWelfareTag=-1&kw=python&kt=3 105 | """ 106 | 107 | def main(): 108 | 109 | # 创建csv文件 110 | csv_url = 'exe_file/python_postion.csv' 111 | fp = open(csv_url, 'wt', newline='', encoding='utf-8-sig') 112 | writer = csv.writer(fp) 113 | writer.writerow(('岗位', '企业名称', '企业规模', '企业类别', '企业主页', '工作地点', '薪酬', '学历要求', '工作经验', '岗位招聘主页')) 114 | 115 | # 智联招聘网址 116 | # kw表示职位关键字,cityId是城市代号 117 | # start和pageSize控制翻页 118 | url = 'https://fe-api.zhaopin.com/c/i/sou?start=90&pageSize=90&' \ 119 | 'cityId=765&workExperience=-1&education=-1&companyType=-1&' \ 120 | 'employmentType=-1&jobWelfareTag=-1&kw=python&kt=3' 121 | # 请求头 122 | headers = { 123 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3', 124 | 'Accept-Language': 'zh-CN,zh;q=0.9', 125 | 'Cache-Control': 'max-age=0', 126 | 'Connection': 'keep-alive', 127 | 'Upgrade-Insecure-Requests': 1, 128 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36' 129 | } 130 | # 请求 131 | request = urllib.request.Request(url=url, headers=headers) 132 | json_text = urllib.request.urlopen(request).read().decode() 133 | 134 | # 将请求到的json转为python对象 135 | json_obj = json.loads(json_text) 136 | # print(json_text) 137 | 138 | # 筛选出招聘职位信息集合 139 | pos_infos = 
jsonpath.jsonpath(json_obj,'$.data.results[*]') 140 | 141 | for info in pos_infos: 142 | # 基于jsonpath的信息查找 143 | job_name = jsonpath.jsonpath(info,'$.jobName')[0] 144 | company_name = jsonpath.jsonpath(info,'$.company.name')[0] 145 | company_size = jsonpath.jsonpath(info,'$.company.size.name')[0] 146 | company_type = jsonpath.jsonpath(info,'$.company.type.name')[0] 147 | company_url = jsonpath.jsonpath(info,'$.company.url')[0] 148 | city = jsonpath.jsonpath(info,'$..city.display')[0] 149 | salary = jsonpath.jsonpath(info,'$.salary')[0] 150 | edu_level = jsonpath.jsonpath(info,'$.eduLevel.name')[0] 151 | working_exp = jsonpath.jsonpath(info,'$.workingExp.name')[0] 152 | position_url = jsonpath.jsonpath(info,'$.positionURL')[0] 153 | 154 | writer.writerow((job_name, company_name, company_size, company_type, 155 | company_url, city, salary, edu_level, working_exp, position_url)) 156 | 157 | 158 | 159 | 160 | if __name__ == '__main__': 161 | main() 162 | -------------------------------------------------------------------------------- /9.1Chrome-headless.py: -------------------------------------------------------------------------------- 1 | """ 2 | PhantomJS 无界面浏览器 3 | selenium+phantomjs 爬虫解决方案 4 | 下拉滚动条到底部 5 | 豆瓣电影下拉 6 | 图片加载 7 | 图片懒加载问题 8 | 在下拉到底部后,对比获取的page1 和 page2 9 | 可以发现 src2全部变为src1 10 | 11 | """ 12 | from selenium import webdriver 13 | 14 | from selenium.webdriver.chrome.options import Options 15 | import time 16 | """ 17 | 使用pip show selenium显示默认安装的是3.1.3版本 18 | 目前使用新版selenium调用PhantomJS是会报这样的错: UserWarning: Selenium support for PhantomJS has been deprecated, 19 | please use headless versions of Chrome or Firefox instead warnings.warn('Selenium support for PhantomJS has been deprecated, please use headless 20 | 如果还想继续用PhantomJS的话只能使用旧版的selenium,卸载之后重新pip install selenium==2.48.0安装成功。 21 | 但其实更好的选择,我们可以使用firefox或chrome的headlesss模式,无需重装selenium 22 | 只需要添加以下代码: 23 | """ 24 | path = r'E:\Program Files\chrome-driver\chromedriver.exe' 25 | 26 | chrome_options = Options() 27 | chrome_options.add_argument('--headless') 28 | chrome_options.add_argument('--disable-gpu')#上面三行代码就是为了将Chrome不弹出界面,实现无界面爬取 29 | browser = webdriver.Chrome(path,options=chrome_options) 30 | 31 | """ 32 | url = 'http://www.baidu.com/' 33 | browser.get(url) 34 | time.sleep(2) 35 | browser.save_screenshot(r'exe_file/baidu.png') 36 | 37 | my_input = browser.find_element_by_id('kw') 38 | my_input.send_keys('美女') 39 | time.sleep(2) 40 | browser.save_screenshot('exe_file/meinv.png') 41 | 42 | button = browser.find_elements_by_class_name('s_btn')[0] 43 | button.click() 44 | time.sleep(2) 45 | browser.save_screenshot('exe_file/show.png') 46 | time.sleep(2) 47 | browser.quit() 48 | """ 49 | 50 | """ 51 | url = 'https://movie.douban.com/typerank?type_name=%E7%88%B1%E6%83%85%E7%89%87&type=13&interval_id=100:90&action=' 52 | 53 | browser.get(url) 54 | time.sleep(3) 55 | browser.save_screenshot('exe_file/douban.png') 56 | # 模拟滚动条滚动到底部 57 | # 不同,教程是用body,改用documentElement 58 | js = 'document.documentElement.scrollTop=10000' 59 | browser.execute_script(js) 60 | time.sleep(3) 61 | browser.save_screenshot('exe_file/douban_d.png') 62 | 63 | # 获取网页代码 64 | html = browser.page_source 65 | 66 | # 保存在文件中 67 | with open(r'exe_file/douban.html','w',encoding='utf-8') as f: 68 | f.write(html) 69 | ''' 70 | 豆瓣的数据是js动态加载的 71 | 两个方法可以获取数据: 72 | 1 直接获取请求接口 -推荐 73 | 2 利用浏览器驱动模拟真正浏览器获取数据,不过这个比较慢 74 | '''' 75 | browser.quit() 76 | 77 | """ 78 | 79 | url = 'http://sc.chinaz.com/tupian/' 80 | browser.get(url) 81 | time.sleep(5) 82 | with 
open(r'exe_file/szchina_page_1.html','w',encoding='utf-8') as fp: 83 | fp.write(browser.page_source) 84 | 85 | # 下拉到底部后再获取图片 86 | js = 'document.documentElement.scrollTop=10000' 87 | browser.execute_script(js) 88 | time.sleep(5) 89 | 90 | with open(r'exe_file/szchina_page_2.html','w',encoding='utf-8') as fp: 91 | fp.write(browser.page_source) 92 | browser.quit() -------------------------------------------------------------------------------- /9selenium.py: -------------------------------------------------------------------------------- 1 | """ 2 | 浏览器自动化测试框架 3 | 是一个python的第三方库,对外提供接口可以操作浏览器 4 | 让浏览器完成自动化操作 5 | 使用selenium 6 | 1 安装 pip install selenium 7 | 2 如何操作谷歌浏览器,首先必须有谷歌浏览器的一个驱动 8 | 3 9 | 驱动与谷歌浏览器的版本映射关系 10 | https://blog.csdn.net/fox990152806/article/details/91881361 11 | 谷歌驱动下载 12 | http://npm.taobao.org/mirrors/chromedriver/ 13 | 4 代码操作 14 | find_element_by_id id 15 | find_element_by_name name 16 | find_element_by_xpath xpath 17 | find_element_by_tag_name 标签名 18 | find_element_by_class_name class名称 19 | find_element_by_css_selector 选择器查找 20 | find_element_by_link_text 根据链接内容 21 | 22 | get\set_keys\click 23 | 24 | """ 25 | 26 | # 简单selenium操作 27 | from selenium import webdriver 28 | import time 29 | 30 | # 模拟创建浏览器对象,通过对象操作浏览器 31 | path = r'E:\Program Files\chrome-driver\chromedriver.exe' 32 | browser = webdriver.Chrome(executable_path=path) 33 | # print(browser) 34 | 35 | # 打开百度 36 | url = 'http://www.baidu.com/' 37 | browser.get(url) 38 | 39 | # 中间有内容请求,发送响应的过程,需要停顿 40 | time.sleep(2) 41 | 42 | # 向百度搜索框中填入关键字 43 | my_input = browser.find_element_by_id('kw') # 对应百度搜索框的id 44 | my_input.send_keys('美女') 45 | 46 | time.sleep(2) 47 | # 查找搜索按钮s 48 | # ..s:返回一个列表 49 | # bg s_btn 不行 50 | button = browser.find_elements_by_class_name('s_btn')[0] 51 | button.click() # 点击 52 | # 页面停顿 53 | time.sleep(2) 54 | 55 | 56 | # 坑:百度已将该图片链接设置为动态,故无法点击 57 | page_url = browser.find_elements_by_class_name('op-img-address-hover')[0] 58 | page_url.click() 59 | 60 | time.sleep(5) 61 | 62 | # 关闭浏览器 63 | browser.quit() 64 | 65 | 66 | from selenium import webdriver 67 | import time 68 | 69 | # 模拟创建浏览器对象,通过对象操作浏览器 70 | path = r'E:\Program Files\chrome-driver\chromedriver.exe' 71 | browser = webdriver.Chrome(executable_path=path) 72 | # print(browser) 73 | 74 | # 打开百度 75 | url = 'http://www.baidu.com/' 76 | browser.get(url) 77 | 78 | browser.find_elements_by_link_text('设置')[0].click() 79 | time.sleep(3) 80 | 81 | browser.find_elements_by_link_text(r'搜索设置')[0].click() 82 | time.sleep(2) 83 | 84 | m = browser.find_element_by_id('nr') 85 | time.sleep(2) 86 | 87 | # 每页搜索50条 88 | m.find_element_by_xpath('//*[@id="nr"]/option[3]').click() 89 | time.sleep(2) 90 | 91 | # 确认更改 92 | browser.find_elements_by_class_name("prefpanelgo")[0].click() 93 | time.sleep(2) 94 | 95 | # 处理弹窗 96 | browser.switch_to_alert().accept() 97 | time.sleep(2) 98 | 99 | # 进行搜索 100 | browser.find_element_by_id('kw').send_keys('美女') 101 | time.sleep(2) 102 | 103 | # 确认 104 | browser.find_element_by_id('su').click() 105 | time.sleep(2) 106 | 107 | # 进入该搜索项 108 | browser.find_elements_by_link_text('美女_百度图片')[0].click() 109 | time.sleep(3) 110 | 111 | 112 | # 关闭浏览器 113 | browser.quit() 114 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # pythonCrawler 2 | [![HitCount](https://hits.b3log.org/ZhuoZhuoCrayon/pythonCrawler.svg)](https://github.com/ZhuoZhuoCrayon/pythonCrawler/) 3 | >## Notice 4 | 1. 
exe_file 是本程序爬取的附录,全部测试、实战读写路径全部指向exe\_file 5 | 2. 本爬虫笔记基于b站 [Python爬虫从入门到高级实战【92集】千锋Python高级教程](https://www.bilibili.comvideo/av37027372) 6 | 3. 在该教程的基础上对教程中的思路进行实践,对教程出现的错误进行修正,并且另外扩展,**并非教程源码照搬** 7 | 4. 由于时间有限,笔记与代码都位于.py文件中,以注释及代码形式存在,对学习过程中会出现的bug以及难点进行分析 8 | 5. 由于作者能力有限以及爬虫技术迭代速度快,代码可能会存在bug,如有此情况,欢迎联系我更正或者pull request 9 | 6. **更新日志的正确打开方式:** 10 | - 数字代表每一章,每个数字的第一个py文件是基础知识讲解及简单实践 11 | - x.x形式的py文件一般是实战内容 12 | - 例如6.基于xpath...是基础知识,那么6.1就是项目实战内容 13 | - **所有的py文件都会有思路、踩坑以及知识点的介绍** 14 | - **人性化设置,md文件的更新日志附属笔记的超链接跳转** 15 | 7. 如果笔记对您有用,麻烦Star谢谢 16 | - - - 17 | >## Update log 18 | 1. __2019/03-2019/03/12__ 19 | - [1.urllib基础](https://github.com/ZhuoZhuoCrayon/pythonCrawler/blob/master/1urllib_base.py) 20 | - [2.利用ajax的特点构建post请求,及对url异常的处理实例:豆瓣,kfc餐厅,百度贴吧的页面爬取](https://github.com/ZhuoZhuoCrayon/pythonCrawler/blob/master/2ajax.py) 21 | - [3.以百度翻译为例介绍fiddler中json包的解析](https://github.com/ZhuoZhuoCrayon/pythonCrawler/blob/master/fillder.py) 22 | - [4.Handler处理器的应用:设置ip及cookieJar,人人网模拟登陆](https://github.com/ZhuoZhuoCrayon/pythonCrawler/blob/master/4handler.py) 23 | - [5.1.利用正则表达式提取糗图网页面信息](https://github.com/ZhuoZhuoCrayon/pythonCrawler/blob/master/5.1%E6%AD%A3%E5%88%99%E7%88%AC%E5%8F%96%E7%B3%97.py) 24 | - [5.2.正则爬取励志网并建立文章集合页面](https://github.com/ZhuoZhuoCrayon/pythonCrawler/blob/master/5.2%E6%AD%A3%E5%88%99%E7%88%AC%E5%8F%96%E5%8A%B1%E5%BF%97%E7%BD%91%E5%B9%B6%E5%BB%BA%E7%AB%8B%E6%96%87%E7%AB%A0%E9%9B%86%E5%90%88%E9%A1%B5%E9%9D%A2.py) 25 | 2. __2019/04-__ 26 | - 项目实战:[智联招聘爬虫-通用版:目前已爬取2019年第一季度IT领域招聘信息数据集](https://github.com/ZhuoZhuoCrayon/pythonCrawler/blob/master/zhilianCrawler.py) 27 | + urllib, BeautifulSoup, 正则表达式, 多线程爬取, json获取, csv文件读写 28 | 3. __2019/07/10__ 29 | - [6.基于xpath的html页面信息提取](https://github.com/ZhuoZhuoCrayon/pythonCrawler/blob/master/6xpath.py) 30 | + 实例:段子网爬取 31 | 4. __2019/07/11__ 32 | - [6.1.读取文件中的列表格式](https://github.com/ZhuoZhuoCrayon/pythonCrawler/blob/master/6.1read_list.py) 33 | + 实例:文本文件中对象的读取 34 | - [7.基于图片懒加载技术的图片下载](https://github.com/ZhuoZhuoCrayon/pythonCrawler/blob/master/7pictureLoad.py) 35 | 5. __2019/07/15__ 36 | - [8.基于jsonpath的json文件解析方法](https://github.com/ZhuoZhuoCrayon/pythonCrawler/blob/master/8jsonpath.py) 37 | + 实例:智联招聘,填补之前智联爬虫采用正则表达式解析json文件的繁琐方法 38 | + b站教程以爬取淘宝评论为例,但现淘宝系统过于难爬,**此处留坑** 39 | 6. __2019/07/16__ 40 | - 谷歌浏览器驱动,适配谷歌75版本---在exeFile目录下 41 | 7. __2019/07/17__ 42 | - [9.基于selenium的浏览器控制访问](https://github.com/ZhuoZhuoCrayon/pythonCrawler/blob/master/9selenium.py) 43 | + 实例:百度关键字搜索 44 | 8. __2019/07/19__ 45 | - [9.1.基于Chrome无界面模式浏览,图片懒加载的特点,异步加载的解决方法](https://github.com/ZhuoZhuoCrayon/pythonCrawler/blob/master/9.1Chrome-headless.py) 46 | + 实例1:豆瓣电影下拉滚动条,懒加载变化解析 47 | + 实例2:百度图片搜索,无界面模式实践 48 | 9. __2019/07/20__ 49 | - **告知:** 50 | + 为方便实例的各种测试文件的查找,在第10章包括以后,每章的测试文件保存在exe\_file/x/下 51 | + **x为对应章节,例如第10章,则位于exe\_file/10/** 52 | - [10.Requests库的基本用法](https://github.com/ZhuoZhuoCrayon/pythonCrawler/blob/master/10-Requests.py) 53 | + 实例:百度搜索,必应翻译,登陆人人网为例介绍post、cookie、get的用法 54 | + 代理使用 55 | - [10.1.Requests库实战](https://github.com/ZhuoZhuoCrayon/pythonCrawler/blob/master/10.1busPath_Crawler.py) 56 | + 实例:爬取深圳所有公交路线 57 | + 运用:json文件读写、Requests库及xpath解析 58 | + 数据集:[深圳公交线路json文件](https://github.com/ZhuoZhuoCrayon/pythonCrawler/blob/master/exe_file/10/bus_line.json) 59 | - [11.验证码登陆方式](https://github.com/ZhuoZhuoCrayon/pythonCrawler/blob/master/11verification_code.py) 60 | + 实例:利用返回验证码到本地的方法登陆古诗文网 61 | + 运用:Requests库(创建会话用于支持cookie),美味汤(beautifulSoup) 62 | 10. 
__2019/07/21-2019/07/26__ 63 | - [11.1pytesser介绍](https://github.com/ZhuoZhuoCrayon/pythonCrawler/blob/master/11.1pytesser.py) 64 | + 介绍了pytesser库以及PIL库的基本使用 65 | - [11.2jTessBoxEditor-tesseract字库训练模式](https://github.com/ZhuoZhuoCrayon/pythonCrawler/blob/master/11.2jTessBoxEditor-tesseract.py) 66 | + 验证码测试脚本 67 | - **[重点:tesseract训练字库详解](https://github.com/ZhuoZhuoCrayon/pythonCrawler/tree/master/tesseract%E8%AE%AD%E7%BB%83%E6%A8%A1%E5%9E%8B)** 68 | + 通过建立特征字符库,逐层加入识别错误的验证码进行补充训练,可以在三次扩充样本训练后达到90%以上识别率 69 | 11. __2019/07/28__ 70 | - [12.视频爬取](https://github.com/ZhuoZhuoCrayon/pythonCrawler/blob/master/12video.py) 71 | + 基于xpath, json, chromeDrive-headless的视频爬取方案 72 | 12. __2019/07/29-2019/07/31__ 73 | - [13.多线程基础汇总](https://github.com/ZhuoZhuoCrayon/pythonCrawler/blob/master/13multiThread.py) 74 | - [13.1多线程的面向对象构造形式](https://github.com/ZhuoZhuoCrayon/pythonCrawler/blob/master/13.1thread_ood.py) 75 | - [13.2队列的基本Queue的基本操作](https://github.com/ZhuoZhuoCrayon/pythonCrawler/blob/master/13.2thread_queue.py) 76 | - [13.3多线程爬取深圳公交线路](https://github.com/ZhuoZhuoCrayon/pythonCrawler/blob/master/13.3Mthread_crawler.py) 77 | + 基于10.1的程序进行多线程重构 78 | + 多线程爬取速度提升至500% 79 | 13. __2019/03-2019/05__ 80 | - [实战:58同城租房价格爬取](https://github.com/ZhuoZhuoCrayon/pythonCrawler/blob/master/58crawler/58decode.py) 81 | + 涉及反爬策略,关于编码转化的技巧 82 | - [实战:中国大学排名爬取](https://github.com/ZhuoZhuoCrayon/pythonCrawler/blob/master/chineseUniversityRankCrawler/RankofNuni.py) 83 | + 美味汤、requests库的使用 84 | - [实战:美桌网图片爬取实例4则](https://github.com/ZhuoZhuoCrayon/pythonCrawler/tree/master/pictureCrawler) 85 | + 入门级别 86 | + 实践多线程、美味汤、requests库 87 | --- 88 | >## Contributing 89 | >如果你对这个项目感兴趣,非常乐意你可以将.py文件的笔记和代码进行格式加工 90 | >>[版权声明]笔记内容是我原创并且开源到github上的,所有内容仅限于学习,不作商用,欢迎star/download/fork,但务必遵守相关开源协议进行使用,原创不易,请勿copy。在实践时遵守爬虫协议,目的只是为了更好的掌握爬虫知识,如果有所影响,请联系我删除,谢谢! 
91 | 92 | -------------------------------------------------------------------------------- /chineseUniversityRankCrawler/RankofNuni.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | import bs4 4 | 5 | def getHTMLText(url): 6 | try: 7 | r = requests.get(url,timeout = 30) 8 | r.raise_for_status() 9 | r.encoding = r.apparent_encoding 10 | return r.text 11 | except: 12 | print("getError") 13 | return "" 14 | return "" 15 | 16 | def fillUnivList(ulist,html): 17 | soup = BeautifulSoup(html,"html.parser") 18 | """ 19 | 结构分析 20 | 排名位于tbody标签下 21 | 每个tr标签是一个学校的信息 22 | tr标签下有多个td标签,保存有该学校的各类指标 23 | """ 24 | # 遍历tr标签 tr标签是tbody标签的孩子 25 | for tr in soup.find('tbody').children: 26 | print(tr) 27 | if isinstance(tr,bs4.element.Tag): 28 | # 获取tr标签下的td标签 29 | tds = tr('td') 30 | # 获取相关指标 只需要第 0 1 3 个相关td标签,分别是学校名称,排名,分数 31 | ulist.append([tds[0].string,tds[1].string,tds[3].string]) 32 | 33 | # 打印前20的榜单 34 | def printUnivList(ulist,num): 35 | """ 36 | print("{:^10}\t{:^6}\t{:^10}".format("排名","学校名称","分数")) 37 | for i in range(num): 38 | u = ulist[i] 39 | print("{:^10}\t{:^6}\t{:^10}".format(u[0],u[1],u[2])) 40 | """ 41 | 42 | # 优化,解决中文不对齐问题 43 | #^num num代表占位 44 | tplt = "{0:^10}\t{1:{3}^10}\t{2:^10}" 45 | # chr(12288)是中文空白符 46 | print(tplt.format("排名","学校名称","分数",chr(12288))) 47 | for i in range(num): 48 | u = ulist[i] 49 | print(tplt.format(u[0],u[1],u[2],chr(12288))) 50 | def main(): 51 | unifo = [] 52 | url = "http://www.zuihaodaxue.cn/zuihaodaxuepaiming2019.html" 53 | html = getHTMLText(url) 54 | fillUnivList(unifo,html) 55 | printUnivList(unifo,20) 56 | 57 | main() 58 | -------------------------------------------------------------------------------- /exe_file/10/chinaunix_login.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 52 | 53 | 54 |
    55 |
    56 |
    57 |     Whoops, looks like something went wrong.
    58 |
    59 |
    60 |
    61 |
    62 | 63 |
    64 | 65 |
    66 | 67 | -------------------------------------------------------------------------------- /exe_file/11/code.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/11/code.png -------------------------------------------------------------------------------- /exe_file/11/code1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/11/code1.png -------------------------------------------------------------------------------- /exe_file/11/code_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/11/code_2.png -------------------------------------------------------------------------------- /exe_file/11/gushiwen_code/gu.bat: -------------------------------------------------------------------------------- 1 | cd C:\Users\crayon\OneDrive\Pycode\Crawler\crawler_Basic\exe_file\11\gushiwen_code 2 | 3 | echo font 0 0 0 0 0>font_properties 4 | 5 | echo Run Tesseract for Training.. 6 | tesseract.exe --psm 10 gu.font.exp0.tif gu.font.exp0 nobatch box.train 7 | 8 | 9 | 10 | echo Compute the Character Set.. 11 | unicharset_extractor.exe gu.font.exp0.box 12 | 13 | 14 | mftraining -F font_properties -U unicharset -O gu.unicharset gu.font.exp0.tr 15 | 16 | echo Clustering.. 17 | cntraining.exe gu.font.exp0.tr 18 | 19 | echo Rename Files.. 20 | 21 | 22 | rename normproto gu.normproto 23 | 24 | rename inttemp gu.inttemp 25 | 26 | 27 | rename pffmtable gu.pffmtable 28 | 29 | 30 | rename shapetable gu.shapetable 31 | 32 | 33 | 34 | echo Create Tessdata.. 35 | 36 | 37 | combine_tessdata.exe gu. 
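REM ---- 训练流程概括(注释补充,按 Tesseract 3.x 常规训练步骤理解,供阅读参考)----
REM  font_properties            : 声明字体属性,"font 0 0 0 0 0" 表示斜体/粗体等标志全部为 0
REM  tesseract ... box.train    : 根据 tif 样本图与 box 标注文件生成特征文件 gu.font.exp0.tr
REM  unicharset_extractor       : 从 box 文件提取字符集 unicharset
REM  mftraining                 : 结合 font_properties 与 unicharset 生成 inttemp / pffmtable / shapetable
REM  cntraining                 : 生成 normproto(字符归一化特征)
REM  rename + combine_tessdata  : 统一加上 gu. 前缀后合并为字库 gu.traineddata,识别时以 -l gu 调用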
38 | -------------------------------------------------------------------------------- /exe_file/11/gushiwen_code/gu.traineddata: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/11/gushiwen_code/gu.traineddata -------------------------------------------------------------------------------- /exe_file/11/gushiwen_code/train_toBox.bat: -------------------------------------------------------------------------------- 1 | cd C:\Users\crayon\OneDrive\Pycode\Crawler\crawler_Basic\exe_file\11\gushiwen_code 2 | 3 | tesseract gu.font.exp0.tif gu.font.exp0 -l gu --psm 7 batch.nochop makebox -------------------------------------------------------------------------------- /exe_file/11/gushiwen_code/第一轮训练.rar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/11/gushiwen_code/第一轮训练.rar -------------------------------------------------------------------------------- /exe_file/11/gushiwen_code/第三轮训练.rar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/11/gushiwen_code/第三轮训练.rar -------------------------------------------------------------------------------- /exe_file/11/gushiwen_code/第二轮训练.rar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/11/gushiwen_code/第二轮训练.rar -------------------------------------------------------------------------------- /exe_file/11/gushiwen_code/第四轮训练.rar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/11/gushiwen_code/第四轮训练.rar -------------------------------------------------------------------------------- /exe_file/11/test/0-9A-Z训练字典/gu.traineddata: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/11/test/0-9A-Z训练字典/gu.traineddata -------------------------------------------------------------------------------- /exe_file/11/verify_code/verify_code.rar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/11/verify_code/verify_code.rar -------------------------------------------------------------------------------- /exe_file/12/download/test.txt: -------------------------------------------------------------------------------- 1 | test file -------------------------------------------------------------------------------- /exe_file/baidu.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/baidu.png -------------------------------------------------------------------------------- /exe_file/book.json: -------------------------------------------------------------------------------- 1 | { "store": { 2 | "book": [ 3 | { "category": "文学", 4 | 
"author": "路遥", 5 | "title": "平凡的世界", 6 | "price": 8.95 7 | }, 8 | { "category": "文学", 9 | "author": "席慕容", 10 | "title": "穆斯林的赞礼", 11 | "price": 12.99 12 | }, 13 | { "category": "历史", 14 | "author": "二月河", 15 | "title": "康熙大帝", 16 | "isbn": "0-553-21311-3", 17 | "price": 8.99 18 | }, 19 | { "category": "言情", 20 | "author": "琼瑶", 21 | "title": "The Lord of the Rings", 22 | "isbn": "0-395-19395-8", 23 | "price": 22.99 24 | } 25 | ], 26 | "bicycle": { 27 | "color": "red", 28 | "price": 19.95 29 | } 30 | } 31 | } -------------------------------------------------------------------------------- /exe_file/chrome-driver/chromedriver.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/chrome-driver/chromedriver.exe -------------------------------------------------------------------------------- /exe_file/douban.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/douban.png -------------------------------------------------------------------------------- /exe_file/douban_d.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/douban_d.png -------------------------------------------------------------------------------- /exe_file/hello.txt: -------------------------------------------------------------------------------- 1 | [{'name':'宫本武藏\n小田纯一郎'}] -------------------------------------------------------------------------------- /exe_file/meinv.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/meinv.png -------------------------------------------------------------------------------- /exe_file/python_postion.csv: -------------------------------------------------------------------------------- 1 | 岗位,企业名称,企业规模,企业类别,企业主页,工作地点,薪酬,学历要求,工作经验,岗位招聘主页 2 | python工程师,深圳市博奥特科技有限公司,100-499人,民营,https://company.zhaopin.com/CZ298487210.htm,深圳-罗湖区,15K-20K,本科,3-5年,https://jobs.zhaopin.com/CC298487217J00237350002.htm 3 | 急聘:Python开发助理/包住/项目奖金,深圳市宇达计算机有限公司,100-499人,外商独资,https://company.zhaopin.com/CZ644104880.htm,深圳,3.5K-7K,学历不限,不限,https://jobs.zhaopin.com/CC644104880J00368828601.htm 4 | 高级后端研发工程师(node/python),深圳市中联创新自控系统有限公司,100-499人,民营,https://company.zhaopin.com/CZ157447310.htm,深圳-龙岗区,18K-30K,本科,3-5年,https://jobs.zhaopin.com/CC157447311J00258918704.htm 5 | Python开发,深圳市德科信息技术有限公司,1000-9999人,股份制企业,https://company.zhaopin.com/CZ589380620.htm,深圳-福田区,10K-15K,本科,3-5年,https://jobs.zhaopin.com/CC589380625J00315155807.htm 6 | Python后端开发,深圳市金鑫云端科技有限公司,100-499人,民营,https://company.zhaopin.com/CZ816720960.htm,深圳,15K-20K,大专,3-5年,https://jobs.zhaopin.com/CC816720960J00396160401.htm 7 | STEAM教育/Python编程老师,深圳格趣创新科技有限公司,20-99人,民营,https://company.zhaopin.com/CZ609854230.htm,深圳,7K-10K,大专,无经验,https://jobs.zhaopin.com/CC609854230J00326990907.htm 8 | 急招Python开发助理/实习生,深圳市十七大道科技有限公司,100-499人,民营,https://company.zhaopin.com/CZ644079680.htm,深圳-南山区,3K-5K,大专,无经验,https://jobs.zhaopin.com/CC644079685J00215102201.htm 9 | python,深圳市乐易网络股份有限公司,100-499人,民营,https://company.zhaopin.com/CZ495056820.htm,深圳,15K-25K,本科,3-5年,https://jobs.zhaopin.com/CC495056821J00261237003.htm 10 | 
python,深圳德科共赢创投合伙企业(有限合伙),不限,民营,https://company.zhaopin.com/CZ868439670.htm,深圳-南山区,10K-15K,大专,3-5年,https://jobs.zhaopin.com/CC868439670J00255308908.htm 11 | python工程师,叠云(北京)科技股份有限公司,20-99人,上市公司,https://company.zhaopin.com/CZ394261830.htm,深圳,10K-15K,本科,3-5年,https://jobs.zhaopin.com/CC394261836J00117878715.htm 12 | python开发工程师,深圳市乐易网络股份有限公司,100-499人,民营,https://company.zhaopin.com/CZ495056820.htm,深圳,10K-20K,本科,1-3年,https://jobs.zhaopin.com/CC495056821J00261241003.htm 13 | python开发工程师,深圳市芒柠科技有限公司,20-99人,民营,https://company.zhaopin.com/CZ466965980.htm,深圳,10K-15K,大专,1-3年,https://jobs.zhaopin.com/CC466965980J00235518307.htm 14 | python,深圳国开创新科技有限公司,500-999人,民营,https://company.zhaopin.com/CZ690553130.htm,深圳-南山区,9K-15K,大专,3-5年,https://jobs.zhaopin.com/CC690553137J00209624102.htm 15 | python后台开发工程师,深圳市悦动天下科技有限公司,100-499人,民营,https://company.zhaopin.com/CZ644924620.htm,深圳,15K-30K,本科,3-5年,https://jobs.zhaopin.com/644924620250125.htm 16 | python工程师,深圳市纽尔科技有限公司,100-499人,民营,https://company.zhaopin.com/CZ571842520.htm,深圳-罗湖区,15K-30K,本科,3-5年,https://jobs.zhaopin.com/CC571842520J00304504205.htm 17 | Python开发工程师,深圳市欧恩德技术有限公司,100-499人,民营,https://company.zhaopin.com/CZ672784720.htm,深圳-南山区,8K-10K,大专,1-3年,https://jobs.zhaopin.com/CC672784724J00396219401.htm 18 | Python开发工程师(深圳),深圳市前海谷雨网络科技有限公司,20-99人,民营,https://company.zhaopin.com/CZ544024680.htm,深圳-南山区,8K-15K,大专,1-3年,https://jobs.zhaopin.com/544024686250018.htm 19 | python高级开发工程师,华南城集团有限公司,1000-9999人,外商独资,https://company.zhaopin.com/CZ502416520.htm,深圳-龙岗区,15K-25K,本科,5-10年,https://jobs.zhaopin.com/CC502416523J00382948305.htm 20 | Python开发(后海),深圳市德科信息技术有限公司,1000-9999人,股份制企业,https://company.zhaopin.com/CZ589380620.htm,深圳,10K-15K,大专,3-5年,https://jobs.zhaopin.com/CC589380625J00318725207.htm 21 | Python开发工程师(深圳),深圳市八斗才数据有限公司,20-99人,民营,https://company.zhaopin.com/CZ562550030.htm,深圳-南山区,10K-15K,大专,1-3年,https://jobs.zhaopin.com/562550032250031.htm 22 | python开发工程师(深圳),深圳市源极光科技有限公司,20-99人,民营,https://company.zhaopin.com/CZ562114530.htm,深圳-南山区,10K-15K,大专,1-3年,https://jobs.zhaopin.com/562114535250038.htm 23 | 高级Python软件工程师,August Robotics Limited,20人以下,外商独资,https://company.zhaopin.com/CZ716976740.htm,深圳,30K-50K,本科,3-5年,https://jobs.zhaopin.com/CZ716976740J00350193203.htm 24 | python,深圳德科共赢创投合伙企业(有限合伙),不限,民营,https://company.zhaopin.com/CZ868439670.htm,深圳-南山区,10K-15K,大专,3-5年,https://jobs.zhaopin.com/CC868439670J00257258908.htm 25 | python开发工程师,赞同科技股份有限公司,1000-9999人,民营,https://company.zhaopin.com/CZ539946580.htm,深圳-福田区,6K-12K,本科,不限,https://jobs.zhaopin.com/CC539946581J00290691103.htm 26 | 软件开发工程师(Python),德硕管理咨询(深圳)有限公司,20-99人,外商独资,https://company.zhaopin.com/CZ154580510.htm,深圳-福田区,8K-15K,大专,1-3年,https://jobs.zhaopin.com/CC154580515J00240303004.htm 27 | Python数据挖掘,深圳前海招文天下金融服务有限公司,不限,民营,https://company.zhaopin.com/CZ815392880.htm,深圳-南山区,11K-18K,本科,3-5年,https://jobs.zhaopin.com/CC815392880J00406708501.htm 28 | 高级Python后端工程师,深圳德聚企业管理咨询有限公司,100-499人,民营,https://company.zhaopin.com/CZ413479080.htm,深圳-南山区,20K-40K,本科,3-5年,https://jobs.zhaopin.com/413479081250105.htm 29 | Python开发工程师,深圳市芒柠科技有限公司,20-99人,民营,https://company.zhaopin.com/CZ466965980.htm,深圳-宝安区,10K-15K,大专,1-3年,https://jobs.zhaopin.com/466965980250155.htm 30 | Python工程师,深圳市凹凸微科科技有限公司,100-499人,民营,https://company.zhaopin.com/CZ238749130.htm,深圳,15K-20K,本科,1-3年,https://jobs.zhaopin.com/CC238749137J00417101501.htm 31 | Python软件开发(应届生),德硕管理咨询(深圳)有限公司,20-99人,外商独资,https://company.zhaopin.com/CZ154580510.htm,深圳-福田区,6K-8K,本科,无经验,https://jobs.zhaopin.com/CC154580515J00254012304.htm 32 | 
python开发工程师,深圳市大富科技股份有限公司,1000-9999人,上市公司,https://company.zhaopin.com/CZ133833090.htm,深圳-福田区,10K-15K,本科,1-3年,https://jobs.zhaopin.com/CC133833090J00215320006.htm 33 | 日语Java/Python开发工程师,Uniqsys优尼卡日本株式会社,20-99人,外商独资,https://company.zhaopin.com/CZ842910190.htm,深圳,15K-30K,本科,不限,https://jobs.zhaopin.com/CC842910190J00176413904.htm 34 | python开发工程师,广州市欢雀科技有限公司,100-499人,民营,http://special.zhaopin.com/pagepublish/41297738/index.html,深圳-南山区,12K-16K,本科,3-5年,https://jobs.zhaopin.com/CC412977388J00340584403.htm 35 | Python开发工程师,深圳市掌世界网络科技有限公司,100-499人,民营,https://company.zhaopin.com/CZ253935310.htm,深圳-罗湖区,8K-15K,本科,3-5年,https://jobs.zhaopin.com/253935313250030.htm 36 | Python后端开发工程师,北京国双科技有限公司,1000-9999人,上市公司,https://company.zhaopin.com/CZ147278820.htm,深圳-福田区,10K-20K,本科,1-3年,https://jobs.zhaopin.com/CC147278824J00166336110.htm 37 | python工程师(区块链),北京天盛京享科技有限公司,20-99人,民营,https://company.zhaopin.com/CZ839864870.htm,深圳,15K-30K,本科,3-5年,https://jobs.zhaopin.com/CC839864870J00177422813.htm 38 | 日语Java/Python开发工程师-初级,Uniqsys优尼卡日本株式会社,20-99人,外商独资,https://company.zhaopin.com/CZ842910190.htm,深圳,5K-10K,本科,不限,https://jobs.zhaopin.com/CC842910190J00180734404.htm 39 | Python后台开发工程师,广州宏鸿钥环境科技有限公司,100-499人,民营,https://company.zhaopin.com/CZ644239030.htm,深圳-宝安区,25K-30K,本科,3-5年,https://jobs.zhaopin.com/CZ644239030J00261748208.htm 40 | Python开发,深圳德聚企业管理咨询有限公司,100-499人,民营,https://company.zhaopin.com/CZ413479080.htm,深圳,15K-30K,大专,3-5年,https://jobs.zhaopin.com/413479081250149.htm 41 | 高级python,深圳德聚企业管理咨询有限公司,100-499人,民营,https://company.zhaopin.com/CZ413479080.htm,深圳-南山区,20K-40K,本科,不限,https://jobs.zhaopin.com/413479081250103.htm 42 | Python运维开发中级讲师,北京千锋互联科技有限公司深圳分公司,1000-9999人,民营,https://company.zhaopin.com/CZ556809120.htm,深圳-宝安区,18K-30K,本科,3-5年,https://jobs.zhaopin.com/CC556809127J00373127101.htm 43 | python工程师,北京嘉连勤科技有限公司,20-99人,民营,https://company.zhaopin.com/CZ475451320.htm,深圳-南山区,8K-15K,学历不限,1-3年,https://jobs.zhaopin.com/CC475451320J00131518714.htm 44 | PHP/Python软件工程师,深圳市蜂窝网络有限公司,20-99人,民营,https://company.zhaopin.com/CZ517946720.htm,深圳,10K-15K,本科,1-3年,https://jobs.zhaopin.com/CC517946724J00320176301.htm 45 | 急聘Python开发工程师 ,深圳神州讯盟软件有限公司,20人以下,民营,https://company.zhaopin.com/CZ248856010.htm,深圳-龙华新区,8K-10K,本科,1-3年,https://jobs.zhaopin.com/CC248856017J00292840102.htm 46 | "日语Java,C,Python,Sap,DB软件开发工程师",Uniqsys优尼卡日本株式会社,20-99人,外商独资,https://company.zhaopin.com/CZ842910190.htm,深圳,20K-30K,本科,不限,https://jobs.zhaopin.com/CC842910190J00173666004.htm 47 | python高级工程师,深圳市美铁科技有限公司,20-99人,民营,https://company.zhaopin.com/CZ711331820.htm,深圳,15K-20K,本科,1-3年,https://jobs.zhaopin.com/CC711331825J00332456307.htm 48 | Python开发,前海泰坦科技(深圳)有限公司,20-99人,民营,https://company.zhaopin.com/CZ739677640.htm,深圳-福田区,10K-15K,本科,3-5年,https://jobs.zhaopin.com/CZ739677640J00416464201.htm 49 | python中级开发工程师,深圳新河通创科技有限公司,500-999人,其它,https://company.zhaopin.com/CZ897911160.htm,深圳-福田区,10K-15K,大专,3-5年,https://jobs.zhaopin.com/CZ897911160J00205978405.htm 50 | python web工程师,深圳市南伽科技信息有限公司,100-499人,民营,https://company.zhaopin.com/CZ616068220.htm,深圳,8K-16K,本科,1-3年,https://jobs.zhaopin.com/616068226250060.htm 51 | python高级开发工程师,深圳市南伽科技信息有限公司,100-499人,民营,https://company.zhaopin.com/CZ616068220.htm,深圳,9K-15K,本科,3-5年,https://jobs.zhaopin.com/CC616068226J00057649707.htm 52 | python工程师,杭州同帆科技有限公司,100-499人,民营,https://company.zhaopin.com/CZ512025020.htm,深圳-南山区,10K-15K,本科,1-3年,https://jobs.zhaopin.com/CC512025027J00287262707.htm 53 | python初级开发工程师,深圳市神州动力数码有限公司,100-499人,民营,https://company.zhaopin.com/CZ145341820.htm,深圳,4K-8K,大专,不限,https://jobs.zhaopin.com/CC145341829J00267045107.htm 
54 | 协议分析师(python爬虫方向),尼尔林技术咨询(深圳)有限公司,20-99人,外商独资,https://company.zhaopin.com/CZ328908510.htm,深圳-南山区,15K-25K,本科,不限,https://jobs.zhaopin.com/CC328908511J00229100104.htm 55 | python开发,深圳市南伽科技信息有限公司,100-499人,民营,https://company.zhaopin.com/CZ616068220.htm,深圳,7K-14K,本科,1-3年,https://jobs.zhaopin.com/CC616068226J00127137107.htm 56 | Python工程师,深圳市南伽科技信息有限公司,100-499人,民营,https://company.zhaopin.com/CZ616068220.htm,深圳,10K-20K,学历不限,3-5年,https://jobs.zhaopin.com/616068226250044.htm 57 | Python工程师,深圳市神经云网络科技有限公司,100-499人,外商独资,https://company.zhaopin.com/CZ578097080.htm,深圳,15K-30K,大专,1-3年,https://jobs.zhaopin.com/CC578097081J00198102905.htm 58 | python开发工程师,深圳市瑞驰信息技术有限公司,100-499人,民营,https://company.zhaopin.com/CZ298519280.htm,深圳-南山区,10K-20K,大专,3-5年,https://jobs.zhaopin.com/298519281250039.htm 59 | python后台开发工程师,深圳市美鸿电子有限公司,20-99人,民营,https://company.zhaopin.com/CZ718489740.htm,深圳,16K-22K,本科,3-5年,https://jobs.zhaopin.com/CZ718489740J00373815407.htm 60 | python数据分析,深圳市美鸿电子有限公司,20-99人,民营,https://company.zhaopin.com/CZ718489740.htm,深圳,15K-20K,本科,3-5年,https://jobs.zhaopin.com/CZ718489740J00373817807.htm 61 | 服务器软件开发工程师(Python方向),深圳市数联信息科技有限公司,20-99人,不限,https://company.zhaopin.com/CZ823276440.htm,深圳-南山区,15K-20K,本科,3-5年,https://jobs.zhaopin.com/CZ823276440J00393859505.htm 62 | python开发工程师,深圳市比一比网络科技有限公司,20-99人,民营,https://company.zhaopin.com/CZ572165720.htm,深圳,10K-20K,大专,3-5年,https://jobs.zhaopin.com/CC572165726J00394238905.htm 63 | python工程师,深圳市比一比网络科技有限公司,20-99人,民营,https://company.zhaopin.com/CZ572165720.htm,深圳,10K-20K,大专,3-5年,https://jobs.zhaopin.com/CC572165726J00394237005.htm 64 | python&Flask 微信公众号软件开发兼职,深圳市薄荷阅读科技有限公司,20人以下,民营,https://company.zhaopin.com/CZ632943280.htm,深圳,4K-6K,学历不限,不限,https://jobs.zhaopin.com/CZ632943280J00174588601.htm 65 | python量化开发工程师,深圳矩心科技有限公司,20人以下,股份制企业,https://company.zhaopin.com/CZ874879190.htm,深圳,8K-13K,大专,3-5年,https://jobs.zhaopin.com/CC874879190J00280523208.htm 66 | 视觉及深度学习算法工程师,深圳飞科机器人有限公司,20-99人,不限,https://company.zhaopin.com/CZ701390880.htm,深圳,15K-25K,本科,3-5年,https://jobs.zhaopin.com/CZ701390880J00421223601.htm 67 | 乐高研发老师,深圳百智教育科技有限公司,20-99人,民营,https://company.zhaopin.com/CZ841629950.htm,深圳,8K-15K,大专,1-3年,https://jobs.zhaopin.com/CC841629950J00283614304.htm 68 | 数据库管理处数据库管理员(J10017),富德保险控股股份有限公司,10000人以上,股份制企业,https://company.zhaopin.com/CZ298522580.htm,深圳,8K-15K,本科,不限,https://jobs.zhaopin.com/CC298522587J00276619903.htm 69 | 软件测试岗,富德保险控股股份有限公司,10000人以上,股份制企业,https://company.zhaopin.com/CZ298522580.htm,深圳,10K-15K,本科,3-5年,https://jobs.zhaopin.com/298522587250081.htm 70 | 工程师助理 视频录制剪辑 文档工程师,深圳市微雪电子有限公司,20-99人,民营,http://special.zhaopin.com/2018/shz/11test/szsw090518,深圳,5K-8K,大专,不限,https://jobs.zhaopin.com/CC401347719J00266136404.htm 71 | 运维工程师,北京魅动力教育咨询有限公司,100-499人,民营,https://company.zhaopin.com/CZ487000820.htm,深圳-南山区,12K-20K,本科,3-5年,https://jobs.zhaopin.com/CC487000829J00172325513.htm 72 | 大数据应用部资深数据应用研发岗,富德保险控股股份有限公司,10000人以上,股份制企业,https://company.zhaopin.com/CZ298522580.htm,深圳,9K-18K,本科,3-5年,https://jobs.zhaopin.com/298522587250124.htm 73 | 数据分析专员,深圳市赛益莱特科技有限公司,20-99人,民营,https://company.zhaopin.com/CZ814100400.htm,深圳-南山区,5K-8K,大专,不限,https://jobs.zhaopin.com/CC814100400J00201589901.htm 74 | 服务器管理员岗,富德保险控股股份有限公司,10000人以上,股份制企业,https://company.zhaopin.com/CZ298522580.htm,深圳,9K-15K,本科,3-5年,https://jobs.zhaopin.com/CC298522587J00300066803.htm 75 | 软件测试工程师(招商金科),深圳市博悦科创科技有限公司,500-999人,民营,https://company.zhaopin.com/CZ435485810.htm,深圳-福田区,10K-15K,本科,1-3年,https://jobs.zhaopin.com/CC435485814J00249600106.htm 76 | 
RD2-测试工程师,深圳银澎云计算有限公司,500-999人,股份制企业,https://company.zhaopin.com/CZ609532620.htm,深圳,10K-15K,本科,3-5年,https://jobs.zhaopin.com/CC609532620J00242978907.htm 77 | RD2-测试工程师(服务器),深圳银澎云计算有限公司,500-999人,股份制企业,https://company.zhaopin.com/CZ609532620.htm,深圳,10K-15K,本科,3-5年,https://jobs.zhaopin.com/CC609532620J00246531807.htm 78 | c++开发工程师,深圳市新类型科技有限公司,20-99人,民营,https://company.zhaopin.com/CZ826945830.htm,深圳,15K-30K,本科,5-10年,https://jobs.zhaopin.com/CZ826945830J00295046108.htm 79 | 测试工程师中级,小欧科技(珠海市)有限责任公司,20-99人,民营,https://company.zhaopin.com/CZ880922430.htm,深圳,10K-15K,大专,3-5年,https://jobs.zhaopin.com/CC880922430J00391273205.htm 80 | RD3-测试工程师(音视频测试),深圳银澎云计算有限公司,500-999人,股份制企业,https://company.zhaopin.com/CZ609532620.htm,深圳,10K-18K,本科,3-5年,https://jobs.zhaopin.com/CC609532620J00356739707.htm 81 | 网络及安全工程师,恒大人寿保险有限公司,1000-9999人,合资,https://company.zhaopin.com/CZ121064920.htm,深圳-南山区,15K-20K,本科,3-5年,https://jobs.zhaopin.com/CC121064921J00381589505.htm 82 | devops工程师,深圳市新类型科技有限公司,20-99人,民营,https://company.zhaopin.com/CZ826945830.htm,深圳,15K-30K,本科,5-10年,https://jobs.zhaopin.com/CZ826945830J00295050808.htm 83 | 测试工程师,北京魅动力教育咨询有限公司,100-499人,民营,https://company.zhaopin.com/CZ487000820.htm,深圳-南山区,12K-18K,本科,3-5年,https://jobs.zhaopin.com/CC487000829J00172326313.htm 84 | 软件测试工程师,深圳睿世达信息科技有限公司,20-99人,民营,https://company.zhaopin.com/CZ483573220.htm,深圳-福田区,8K-10K,大专,3-5年,https://jobs.zhaopin.com/483573223250154.htm 85 | java高级开发工程师,春禾(深圳)自动化技术有限公司,100-499人,民营,https://company.zhaopin.com/CZ548753130.htm,深圳-光明新区,10K-20K,本科,1-3年,https://jobs.zhaopin.com/CZ548753130J00216868602.htm 86 | 算法研究生(计算机视觉方向),深圳市方直科技股份有限公司,100-499人,民营,https://company.zhaopin.com/CZ143270300.htm,深圳-南山区,15K-22K,硕士,不限,https://jobs.zhaopin.com/CC143270306J00372626607.htm 87 | 前端/小程序开发工程师,深圳市大律科技有限公司,20-99人,民营,https://company.zhaopin.com/CZ422692680.htm,深圳,10K-15K,本科,3-5年,https://jobs.zhaopin.com/CZ422692680J00378070905.htm 88 | 算法工程师,深圳市铁越电气有限公司,100-499人,民营,https://company.zhaopin.com/CZ145536890.htm,深圳,11K-19K,本科,1-3年,https://jobs.zhaopin.com/CC145536893J00125372602.htm 89 | 运维工程师,深圳市房一族网络科技有限公司,100-499人,民营,https://company.zhaopin.com/CZ815431260.htm,深圳,13K-16K,本科,3-5年,https://jobs.zhaopin.com/CC815431260J00321222307.htm 90 | 生物信息工程师,广东美格基因科技有限公司,20-99人,其它,http://special.zhaopin.com/pagepublish/47718393/index.html,深圳-龙岗区,6K-12K,本科,不限,https://jobs.zhaopin.com/CC477183933J00293271902.htm 91 | 强化学习,真玫智能科技(深圳)有限公司,100-499人,民营,https://company.zhaopin.com/CZ330043480.htm,深圳-南山区,15K-30K,本科,3-5年,https://jobs.zhaopin.com/CC330043489J00293303305.htm 92 | -------------------------------------------------------------------------------- /exe_file/show.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/show.png -------------------------------------------------------------------------------- /exe_file/xinggan/hpic408_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/hpic408_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic12973_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic12973_s.jpg 
-------------------------------------------------------------------------------- /exe_file/xinggan/zzpic13004_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic13004_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic13068_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic13068_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic13087_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic13087_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic13131_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic13131_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic13242_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic13242_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic13256_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic13256_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic13424_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic13424_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic13487_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic13487_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic13589_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic13589_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic13628_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic13628_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic13668_s.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic13668_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic13710_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic13710_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic13772_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic13772_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic13941_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic13941_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic14042_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic14042_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic14131_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic14131_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic14178_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic14178_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic14185_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic14185_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic14298_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic14298_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic14358_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic14358_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic14425_s.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic14425_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic14458_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic14458_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic14479_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic14479_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic14568_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic14568_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic14603_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic14603_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic14638_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic14638_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic14802_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic14802_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic14872_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic14872_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic14965_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic14965_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic15059_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic15059_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic15084_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic15084_s.jpg 
-------------------------------------------------------------------------------- /exe_file/xinggan/zzpic15247_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic15247_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic15324_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic15324_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic15420_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic15420_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic15469_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic15469_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic15567_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic15567_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic15608_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic15608_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic15786_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic15786_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic15891_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic15891_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic15920_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic15920_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic16049_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic16049_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic16135_s.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic16135_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic16191_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic16191_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic16240_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic16240_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic16394_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic16394_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic16406_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic16406_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic16566_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic16566_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic16638_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic16638_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic16686_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic16686_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic16786_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic16786_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic16807_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic16807_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic16817_s.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic16817_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic16857_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic16857_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic16889_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic16889_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic16921_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic16921_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic16949_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic16949_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic17052_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic17052_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic17175_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic17175_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic17202_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic17202_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic17322_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic17322_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic17359_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic17359_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic17378_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic17378_s.jpg 
-------------------------------------------------------------------------------- /exe_file/xinggan/zzpic17442_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic17442_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic17558_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic17558_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic17615_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic17615_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic17727_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic17727_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic17778_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic17778_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic17797_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic17797_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic17879_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic17879_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic17946_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic17946_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic18038_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic18038_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic18089_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic18089_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic18110_s.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic18110_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic18144_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic18144_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic18308_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic18308_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic18433_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic18433_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic18631_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic18631_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic18883_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic18883_s.jpg -------------------------------------------------------------------------------- /exe_file/xpath.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | xpath Test 6 | 7 | 8 |
    [xpath.html lines 9-49: the tag markup was not preserved in this dump; the recoverable text content of the test fixture is listed below]
    火药 指南针 印刷术 造纸术
    • 停车坐爱枫林晚,霜叶红于二月花
    • 商女不知亡国恨,隔江犹唱后庭花
    • 一骑红尘妃子笑,无人知是荔枝来
    • 葡萄美酒夜光杯,欲饮琵琶马上催
    • 百度一下
    1. 寻寻觅觅,冷冷清清,凄凄惨惨戚戚
    2. 乍暖还寒时候,最难将息
    3. 三杯两盏淡酒
    4. 怎敌他晚来风
    5. 雁过也,正伤心,却是旧时相识
    6. 爱就一个字,我只说一次
    7. 爱情36计,我要立刻美丽
    50 | 51 | 52 | -------------------------------------------------------------------------------- /fillder.py: -------------------------------------------------------------------------------- 1 | """ 2 | code by python3.7,utf8 3 | author:caixiaoxin_ 4 | """ 5 | """ 6 | post: 7 | 表单数据的处理:form_data = urllib.parse.urlencode(form_data).encode() 8 | fillder抓包,带箭头小本表示post 9 | 10 | fillder对json的常见查看 11 | 请求部分: 12 | WebForms:查看post请求表单,用于构造post清单 13 | Raw:查看头部信息,构造headers 14 | response部分: 15 | headers-Content-Encoding:查看response的编码----先查就不会有下面的错误了 16 | JSON:查看JSON解析 17 | """ 18 | 19 | 20 | import urllib.request 21 | import urllib.parse 22 | 23 | post_url = 'https://fanyi.baidu.com/sug' 24 | word = 'baby' 25 | 26 | # 构建post表单数据 27 | form_data = { 28 | 'kw':word, 29 | } 30 | 31 | """ 32 | form_data = urllib.parse.urlencode(form_data) 33 | 报错:POST data should be bytes 34 | urlencode结果是utf8,但是post数据规定为字节类型 35 | 需要再使用encode转成字节型 36 | """ 37 | 38 | form_data = urllib.parse.urlencode(form_data).encode() 39 | 40 | #发送请求 41 | headers = { 42 | 'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36", 43 | } 44 | 45 | request = urllib.request.Request(url=post_url,headers=headers) 46 | response = urllib.request.urlopen(request,data = form_data) 47 | print(response.read().decode()) 48 | 49 | 50 | 51 | 52 | 53 | # 获得完整的百度翻译json 54 | post_url = 'https://fanyi.baidu.com/v2transapi' 55 | form_data = { 56 | 'from': 'en', 57 | 'to': 'zh', 58 | 'query': 'wolf', 59 | 'transtype': 'realtime', 60 | 'simple_means_flag': '3', 61 | 'sign': '275695.55262', 62 | 'token': 'd7627f387f6d0d573368943337783227', 63 | } 64 | headers = { 65 | 'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36", 66 | 'Host': 'fanyi.baidu.com', 67 | 'Connection': 'keep-alive', 68 | # 'Content-Length': '120', 69 | 'Accept': '*/*', 70 | 'Origin': 'https://fanyi.baidu.com', 71 | 'X-Requested-With': 'XMLHttpRequest', 72 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36', 73 | 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8', 74 | 'Referer': 'https://fanyi.baidu.com/', 75 | # 'Accept-Encoding': 'gzip, deflate, br', 76 | 'Accept-Language': 'zh-CN,zh;q=0.9', 77 | 'Cookie': 'BAIDUID=C609024C7FB6D201F3FDA13AB612DCCD:FG=1; ' 78 | 'BIDUPSID=C609024C7FB6D201F3FDA13AB612DCCD; PSTM=1548991277; ' 79 | 'BDUSS=WdmNU96am9Hc3hNc2J5Mn5DZWFsS3hEYmV4c0lGYWJIM1VEekFPdWxpY' 80 | 'mI1WDFjQVFBQUFBJCQAAAAAAAAAAAEAAAAA0Kows~bK28j8tvu6xTg4AAAAAAAA' 81 | 'AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAANtYVlzbWFZcS; ' 82 | 'delPer=0; PSINO=6; BDRCVFR[dG2JNJb_ajR]=mk3SLVN4HKm; BDRCVFR[tox4WRQ4-Km]=mk3SLVN4HKm; ' 83 | 'BDRCVFR[-pGxjrCMryR]=mk3SLVN4HKm; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; ' 84 | 'H_PS_PSSID=1445_21103_18560_28585_28557_28519_20718; locale=zh; REALTIME_TRANS_SWITCH=1; ' 85 | 'FANYI_WORD_SWITCH=1; HISTORY_SWITCH=1; SOUND_SPD_SWITCH=1; SOUND_PREFER_SWITCH=1; Hm_lvt_64ecd82404c51e03d' 86 | 'c91cb9e8c025574=1551442945,1551442984; Hm_lpvt_64ecd82404c51e03dc91cb9e8c025574=1551442984; to_lang_often=%5B%7B%22value' 87 | '%22%3A%22en%22%2C%22text%22%3A%22%u82F1%u8BED%22%7D%2C%7B%22value%22%3A%22zh%22%2C%22text%22%3A%22%u4E2D%u6587%22%7D%5D; ' 88 | 'from_lang_often=%5B%7B%22value%22%3A%22zh%22%2C%22text%22%3A%22%u4E2D%u6587%22%7D%2C%7B%22value%22%3A%22en%22%2C%22text%22%3A%22%u82F1%u8BED%22%7D%5D', 89 | 90 | 
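    # 'Accept-Encoding' is deliberately left commented out above: if the server is
    # allowed to gzip the body, response.read().decode() below raises the
    # UnicodeDecodeError described in the note after this dict.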
91 | } 92 | """ 93 | 2 94 | UnicodeDecodeError: 'utf-8' codec can't decode byte 0x8b in position 1: invalid start byte 95 | 完成1后编码错误 96 | 97 | 'Accept-Encoding': 'gzip, deflate, br',注意此条,zip是压缩格式 98 | 屏蔽该头部信息就能得到json,浏览器和fidder自动解json,但是在py中只能借助json在线解析 99 | https://www.json.cn/ 100 | """ 101 | 102 | #提交post三部曲 103 | request = urllib.request.Request(url = post_url,headers=headers) #构建请求对象:也就是伪装headers 104 | form_data = urllib.parse.urlencode(form_data).encode() #构建post清单 105 | response = urllib.request.urlopen(request,data=form_data) #获取url链接 106 | print(response.read().decode()) 107 | 108 | """ 109 | 1 110 | {"error":997,"from":"en","to":"zh","query":"wolf"} 111 | 暴力构造post失败,正常情况下的error是0 112 | 解决方法:将fidder中RAW的全部信息拷贝到headers,如上 113 | """ 114 | -------------------------------------------------------------------------------- /meizhuo_crawler.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import random 3 | import os 4 | from bs4 import BeautifulSoup 5 | import threading 6 | 7 | class crawler_pic(threading.Thread): 8 | begin_index = 0 # 起始页面 9 | end_index = 0 # 终止页 10 | grads = 20 # 爬取梯度:每个线程爬虫需要执行的爬取页数 11 | # 链接 12 | base_url = "http://www.win4000.com/wallpaper_big_154{}.html" 13 | # 图片保存根目录 14 | file_root = "D://pics_multi//" 15 | # 伪装浏览器 16 | UA = [ 17 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50", 18 | "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50", 19 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1", 20 | "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1" 21 | ] 22 | # 随机构造头部信息 23 | headers = { 24 | "User-Agent": random.choice(UA) 25 | } 26 | def __init__(self, name, begin): 27 | threading.Thread.__init__(self) 28 | self.name = name 29 | self.begin_index = begin 30 | self.end_index = begin + self.grads 31 | # 获取 32 | def get_html(self, url): 33 | try: 34 | HTML = requests.get(url,headers=self.headers) 35 | HTML.raise_for_status() 36 | HTML.encoding = HTML.apparent_encoding 37 | return HTML.text 38 | except: 39 | print("In "+self.name+":ERROR Load "+url) 40 | return "NULL" 41 | # 将获取的图片存储至根目录下 42 | def store_pics(self,pic_urls): 43 | fileName = pic_urls[0]+"//" 44 | for picurl in pic_urls[1:]: 45 | # 构造图片存储地址 46 | path = self.file_root + fileName + picurl.split('/')[-1] 47 | print(path) 48 | 49 | try: 50 | # 需要逐层创建目录 51 | if not os.path.exists(self.file_root): 52 | os.mkdir(self.file_root) 53 | # 如无该目录,先行构建 54 | if not os.path.exists(self.file_root+fileName): 55 | os.mkdir(self.file_root+fileName) 56 | # 图片存在,不重复保存 57 | # 不存在,创建 58 | if not os.path.exists(path): 59 | pic = requests.get(picurl) 60 | with open(path, 'wb') as f: 61 | f.write(pic.content) 62 | f.close() 63 | print("图片:" + picurl + " 成功下载") 64 | else: 65 | print("图片已存在") 66 | except: 67 | print("爬取失败") 68 | return 1 69 | 70 | # 在html页面中获取图片链接,返回链接列表 71 | def get_pic_urls(self, HTML): 72 | 73 | pic_urls = ["filename"] 74 | soup = BeautifulSoup(HTML, "html.parser") 75 | """ 76 | 页面分析: 77 | 图片链接位于标签
    <div id="picBox" class="picBox"> --
  • -- [href:pic_url] 78 | 获取最上层:div 全部子孙标签 选取a 获取a的属性信息 79 | """ 80 | for tag in soup.find("div", attrs={"id": "picBox", "class": "picBox"}).descendants: 81 | if tag.name == 'img': 82 | pic_urls.append(tag.attrs['src']) 83 | pic_urls[0] = tag.attrs['title'] 84 | """ 85 | for a_tag in soup.find("div", attrs={"id": "picBox", "class": "picBox"}).findAll("a"): 86 | pic_urls.append(a_tag.attrs['href']) 87 | """ 88 | # 全局,记录图片数量 89 | global pic_num 90 | pic_num += len(pic_urls) - 1 91 | return pic_urls 92 | 93 | # 线程方法 94 | def run(self): 95 | # 爬取一遍分配的页面 96 | for i in range(self.begin_index,self.end_index): 97 | html = self.get_html(self.base_url.format(i)) 98 | # 页面爬取成功的情况下获取图片链接 99 | if html != "NULL": 100 | pic_urls = self.get_pic_urls(html) 101 | self.store_pics(pic_urls) 102 | """ 103 | for pic in pic_urls: 104 | print("in "+self.name+":"+pic) 105 | """ 106 | 107 | 108 | if __name__ == '__main__': 109 | 110 | threads = [] 111 | count = 0 112 | pic_num = 0 113 | # 构造爬虫 114 | for begin in range(700,900,20): 115 | threads.append(crawler_pic("Thread-begin:"+str(begin),begin)) 116 | 117 | # 开始爬取 118 | for thread in threads: 119 | thread.start() 120 | 121 | for thread in threads: 122 | thread.join() 123 | 124 | 125 | print(pic_num) 126 | -------------------------------------------------------------------------------- /pictureCrawler/PictureDown.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import re 3 | import os 4 | import time 5 | url_root = 'http://www.win4000.com/wallpaper_big_154' 6 | # http://www.win4000.com/wallpaper_big_154(3bits).html 7 | user = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36"} 8 | pattern = re.compile(r'http://pic1.win4000.com/wallpaper/[\w|-]+/[\w]+.jpg') 9 | 10 | def get_picture_url(suffix): 11 | try: 12 | url = url_root + str(suffix) + ".html" 13 | print(url) 14 | r = requests.get(url,headers = user) 15 | r.raise_for_status() 16 | validpart = r.text.split('当前位置')[-1] 17 | validpart = validpart.split('listBox')[0] 18 | picurl_list = pattern.findall(validpart) 19 | return picurl_list 20 | except: 21 | print("ERROR") 22 | return ["NULL"] 23 | 24 | def store_pic(picurl_list): 25 | 26 | if "NULL" in picurl_list: 27 | return 0 28 | file_root = "D://pics//" 29 | 30 | for picurl in picurl_list: 31 | path = file_root + picurl.split('/')[-1] 32 | try: 33 | if not os.path.exists(file_root): 34 | os.mkdir(file_root) 35 | if not os.path.exists(path): 36 | pic = requests.get(picurl) 37 | with open(path,'wb') as f: 38 | f.write(pic.content) 39 | f.close() 40 | print("图片:"+picurl+" 成功下载") 41 | else: 42 | print("图片已存在") 43 | except: 44 | print("爬取失败") 45 | return 1 46 | 47 | if __name__ == '__main__': 48 | for suffix in range(800,900): 49 | store_pic(get_picture_url(suffix)) 50 | time.sleep(5) -------------------------------------------------------------------------------- /pictureCrawler/depthPicCrawler.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import random 3 | import _thread 4 | import threading 5 | import re 6 | from bs4 import BeautifulSoup 7 | UA = [ 8 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50", 9 | "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50", 10 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) 
Gecko/20100101 Firefox/4.0.1", 11 | "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1" 12 | ] 13 | # 随机构造头部信息 14 | headers = { 15 | "User-Agent": random.choice(UA) 16 | } 17 | global thread_max_num 18 | thread_max_num = 20 19 | init_links = ['http://www.win4000.com/wallpaper_193_0_0_1.html', 'http://www.win4000.com/wallpaper_0_0_0_1.html', 20 | 'http://www.win4000.com/hj/haolanzhuan.html', 'http://www.win4000.com/wallpaper_192_0_0_1.html', 21 | 'http://www.win4000.com/wallpaper.html', 'http://www.win4000.com/wallpaper_detail_155224.html', 'http://www.win4000.com/mt/index.html', 22 | 'http://www.win4000.com/wallpaper_201_0_0_1.html', 'http://www.win4000.com/meitu.html', 23 | 'http://www.win4000.com/wallpaper_197_0_0_1.html', 'http://www.win4000.com/wallpaper_195_0_0_1.html', 24 | 'http://www.win4000.com/mobile.html', 'http://www.win4000.com/retu.html','http://www.win4000.com' 25 | 'http://www.win4000.com/wallpaper_194_0_0_1.html', 'http://www.win4000.com/zt/index.html', 26 | 'http://www.win4000.com/hj/index.html', 'http://www.win4000.com/wallpaper_191_0_0_1.html', 27 | 'http://www.win4000.com/wallpaper_196_0_0_1.html', 'http://www.win4000.com/mt/star.html'] 28 | pages = set() 29 | class myThread(threading.Thread): 30 | def __init__(self,name,url): 31 | threading.Thread.__init__(self) 32 | self.name = name 33 | self.url = url 34 | def run(self): 35 | crawler(self.name,self.url,1) 36 | 37 | def get_html(url): 38 | try: 39 | HTML = requests.get(url, headers=headers) 40 | HTML.raise_for_status() 41 | HTML.encoding=HTML.apparent_encoding 42 | return HTML.text 43 | except: 44 | # print("ERROR:"+url) 45 | return "NULL" 46 | 47 | 48 | def crawler(thread_name,url,depth): 49 | if depth > 20: 50 | return 51 | demo = get_html(url) 52 | try: 53 | soup = BeautifulSoup(demo,"html.parser") 54 | get_pic_url(soup) 55 | for link in soup.findAll("a",href=re.compile("http://www.win4000.com/[\S]*.html")): 56 | if "href" in link.attrs: 57 | if link.attrs['href'] not in pages: 58 | newpage = link.attrs['href'] 59 | pages.add(newpage) 60 | crawler(thread_name,newpage,depth+1) 61 | 62 | 63 | except: 64 | print("e!") 65 | pass 66 | #
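# The per-wallpaper download link sits in <div class="paper-down"><a href="...">;
# get_pic_url() below extracts that href plus the <h1> title for every page crawled.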
    67 | # 下载图片 68 | pic_urls = set() 69 | def get_pic_url(soup): 70 | try: 71 | a_tag = soup.find("div",attrs={"class":"paper-down"}).a 72 | if "href" in a_tag.attrs: 73 | pic_url = a_tag['href'] 74 | if pic_url not in pic_urls: 75 | title = soup.find("h1").string 76 | pic_urls.add(pic_url) 77 | print("NO."+str(len(pic_urls))+"-"+title+":"+pic_url) 78 | except: 79 | pass 80 | 81 | 82 | threads = [] 83 | counter = 0 84 | for link in init_links: 85 | threads.append(myThread(str(counter),link)) 86 | counter += 1 87 | for thread in threads: 88 | thread.start() 89 | for thread in threads: 90 | thread.join() 91 | 92 | """ 93 | url = "http://www.win4000.com/wallpaper_detail_40709.html" 94 | demo = get_html(url) 95 | soup = BeautifulSoup(demo,"html.parser") 96 | get_pic_url(soup) 97 | """ -------------------------------------------------------------------------------- /pictureCrawler/informationMark.py: -------------------------------------------------------------------------------- 1 | # XML 2 | """ 3 | ... 4 | 缩写 5 | 注释 6 | """ 7 | #JSON 有类型的键值对 8 | """ 9 | "key" : "value" 10 | "key" : ["value1","value2"] 11 | "key" : {"subkey" : "subvalue"} 嵌套键值对采用花括号 12 | """ 13 | 14 | # YAML 缩进体现所属关系 15 | """ 16 | 1: "|"表示整块可跨行信息 17 | 18 | key : value 19 | key : #comment 20 | - value1 21 | - value2 22 | key : 23 | subkey : subvalue 24 | """ 25 | 26 | from bs4 import BeautifulSoup 27 | import requests 28 | import re 29 | 30 | r = requests.get("https://st.58.com/chuzu/?PGTID=0d100000-0030-f99b-60c3-61bb358828a0&ClickID=3") 31 | demo = r.text 32 | soup = BeautifulSoup(demo,"html.parser") 33 | # print(soup.prettify()) 34 | """ 35 | for link in soup.find_all('a'): 36 | print(link.get('href')) 37 | """ 38 | allTag=[] 39 | for tag in soup.find_all("div","des"): 40 | a_soup = BeautifulSoup(tag.text,"html.parser") 41 | for a_tag in a_soup("a",tongji_label="listclick", 42 | onclick="clickLog('from=fcpc_zflist_gzcount');", 43 | target="_blank",rel="nofollow"): 44 | print(str(a_tag.string).strip()) 45 | for a_tag in soup.find_all("p","room strongbox"): 46 | print(str(a_tag.string).strip()) 47 | 48 | 49 | # 利用正则搜索 50 | allTag = [] 51 | for tag in soup.find_all(re.compile('h')): 52 | if tag.name not in allTag: 53 | allTag.append(tag.name) 54 | # print(tag.name) 55 | else: 56 | pass 57 | 58 | # 重点!!! 
59 | # 规定标签"img" 及标签属性 alt="孟子义写真图片高清桌面壁纸" 可准确找到所查找信息,"孟子义写真图片高清桌面壁纸"是准确匹配 60 | # 模糊匹配用正则 61 | for tag in soup.find_all("img",alt="孟子义写真图片高清桌面壁纸"): 62 | print(tag.get('src')) -------------------------------------------------------------------------------- /pictureCrawler/multiPicDown.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import random 3 | import os 4 | from bs4 import BeautifulSoup 5 | import threading 6 | 7 | class crawler_pic(threading.Thread): 8 | begin_index = 0 # 起始页面 9 | end_index = 0 # 终止页 10 | grads = 20 # 爬取梯度:每个线程爬虫需要执行的爬取页数 11 | # 链接 12 | base_url = "http://www.win4000.com/wallpaper_big_154{}.html" 13 | # 图片保存根目录 14 | file_root = "D://pics_multi//" 15 | # 伪装浏览器 16 | UA = [ 17 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50", 18 | "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50", 19 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1", 20 | "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1" 21 | ] 22 | # 随机构造头部信息 23 | headers = { 24 | "User-Agent": random.choice(UA) 25 | } 26 | def __init__(self, name, begin): 27 | threading.Thread.__init__(self) 28 | self.name = name 29 | self.begin_index = begin 30 | self.end_index = begin + self.grads 31 | # 获取 32 | def get_html(self, url): 33 | try: 34 | HTML = requests.get(url,headers=self.headers) 35 | HTML.raise_for_status() 36 | HTML.encoding = HTML.apparent_encoding 37 | return HTML.text 38 | except: 39 | print("In "+self.name+":ERROR Load "+url) 40 | return "NULL" 41 | # 将获取的图片存储至根目录下 42 | def store_pics(self,pic_urls): 43 | fileName = pic_urls[0]+"//" 44 | for picurl in pic_urls[1:]: 45 | # 构造图片存储地址 46 | path = self.file_root + fileName + picurl.split('/')[-1] 47 | print(path) 48 | 49 | try: 50 | # 需要逐层创建目录 51 | if not os.path.exists(self.file_root): 52 | os.mkdir(self.file_root) 53 | # 如无该目录,先行构建 54 | if not os.path.exists(self.file_root+fileName): 55 | os.mkdir(self.file_root+fileName) 56 | # 图片存在,不重复保存 57 | # 不存在,创建 58 | if not os.path.exists(path): 59 | # request获取图片内容 60 | pic = requests.get(picurl) 61 | with open(path, 'wb') as f: 62 | f.write(pic.content) 63 | f.close() 64 | print("图片:" + picurl + " 成功下载") 65 | else: 66 | print("图片已存在") 67 | except: 68 | print("爬取失败") 69 | return 1 70 | 71 | # 在html页面中获取图片链接,返回链接列表 72 | def get_pic_urls(self, HTML): 73 | 74 | pic_urls = ["filename"] 75 | soup = BeautifulSoup(HTML, "html.parser") 76 | """ 77 | 页面分析: 78 | 图片链接位于标签
    <div id="picBox" class="picBox"> --
  • -- [href:pic_url] 79 | 获取最上层:div 全部子孙标签 选取a 获取a的属性信息 80 | """ 81 | for tag in soup.find("div", attrs={"id": "picBox", "class": "picBox"}).descendants: 82 | if tag.name == 'img': 83 | pic_urls.append(tag.attrs['src']) 84 | pic_urls[0] = tag.attrs['title'] 85 | """ 86 | for a_tag in soup.find("div", attrs={"id": "picBox", "class": "picBox"}).findAll("a"): 87 | pic_urls.append(a_tag.attrs['href']) 88 | """ 89 | # 全局,记录图片数量 90 | global pic_num 91 | pic_num += len(pic_urls) - 1 92 | return pic_urls 93 | 94 | # 线程方法 95 | def run(self): 96 | # 爬取一遍分配的页面 97 | for i in range(self.begin_index,self.end_index): 98 | html = self.get_html(self.base_url.format(i)) 99 | # 页面爬取成功的情况下获取图片链接 100 | if html != "NULL": 101 | pic_urls = self.get_pic_urls(html) 102 | self.store_pics(pic_urls) 103 | """ 104 | for pic in pic_urls: 105 | print("in "+self.name+":"+pic) 106 | """ 107 | 108 | 109 | if __name__ == '__main__': 110 | 111 | threads = [] 112 | count = 0 113 | pic_num = 0 114 | # 构造爬虫 115 | for begin in range(700,900,20): 116 | threads.append(crawler_pic("Thread-begin:"+str(begin),begin)) 117 | 118 | # 开始爬取 119 | for thread in threads: 120 | thread.start() 121 | 122 | for thread in threads: 123 | thread.join() 124 | 125 | 126 | print(pic_num) -------------------------------------------------------------------------------- /tesseract训练模型/0-9A-Z.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/tesseract训练模型/0-9A-Z.png -------------------------------------------------------------------------------- /tesseract训练模型/README.md: -------------------------------------------------------------------------------- 1 | # 验证码识别-训练字库 2 | ## Purpose 3 | >验证码的存在使得在模拟登陆时会遇到障碍,通过训练通用验证码识别库,可以通过加入几行代码轻松登陆,对网页进行测试\ 4 | >通过建立特征字符库,逐层加入识别错误的验证码进行补充训练,可以在三次扩充样本训练后达到90%以上识别率\ 5 | >![验证码样例](oriCode.png) 6 | ## Tools 7 | >**Tesseract** 8 | >>**基于OCR原理**:Optical Character Recognition,光学字符识别,是指通过扫描字符,然后通过其形状将其翻译成电子文本的过程。\ 9 | 对于图形验证码来说,它们都是一些不规则的字符,这些字符确实是由字符稍加扭曲变换得到的内容。\ 10 | >>[Windows安装Tesseract-OCR 4.00并配置环境变量](https://segmentfault.com/a/1190000014086067):现在最新版本为5.00,安装及配置方法一致\ 11 | >>[tesseract v4.0.0 帮助文档解读](https://blog.csdn.net/qq_32674197/article/details/80744783) 12 | 13 | >**jTessBoxEditor** 14 | >>安装后记得把ocr目录下文件全部替换成Tesseract的OCR目录下的文件 15 | 16 | >**pytesseract** 17 | >>安装:**pip install pytesseract**\ 18 | >>简介:基于Tessract的图片光学字符识别库\ 19 | >>[官方文档](https://pypi.org/project/pytesseract/) 20 | 21 | 22 | >**PIL** 23 | >>类似与OpenCV的图像处理库,用于验证码的预处理\ 24 | >>[PIL介绍](https://www.cnblogs.com/lyrichu/p/9124504.html)\ 25 | >>[Python图像处理库PIL的ImageEnhance模块介绍](https://blog.csdn.net/icamera0/article/details/50753705) 26 | 27 | ## Training 28 | >**预处理验证码图片** 29 | >>目的:原始验证码图片存在背景纹路等干扰,通过PIL库的ImageEnhance模块进行预处理,突出文本\ 30 | >>下面代码介绍了一种参数配置方法,可以很有效突出识别文本特征 31 | >>```python 32 | >>from PIL import Image 33 | >>from PIL import ImageEnhance 34 | >>img = Image.open('exe_file/11/code1.png') 35 | >>print(img) 36 | >>img= img.convert('RGB') 37 | >># 颜色调到最暗 38 | >>enhancer = ImageEnhance.Color(img) 39 | >>enhancer = enhancer.enhance(0) 40 | >># 增加亮度 41 | >>enhancer = ImageEnhance.Brightness(enhancer) 42 | >>enhancer = enhancer.enhance(4) 43 | >># 增加对比度 44 | >>enhancer = ImageEnhance.Contrast(enhancer) 45 | >>enhancer = enhancer.enhance(15) 46 | >># 增加图片锐度 47 | >>enhancer = ImageEnhance.Sharpness(enhancer) 48 | >>img = enhancer.enhance(25) 49 | >># img.show() 50 | >># 转成灰度图片 51 | >>img = img.convert('L') 52 | >># 
img.show() 53 | >>#二值化处理 54 | >>threshold = 140 55 | >>table=[] 56 | >>for i in range(256): 57 | >> if i < threshold: 58 | >> table.append(0) 59 | >> else: 60 | >> table.append(1) 61 | >>out = img.point(table,'1') 62 | >>out.show() 63 | >>``` 64 | >**第一次训练:建立单字符字库** 65 | >>对于该网站,验证码只存在0-9A-Z 36个字符,单独提取36个字符的图片,采用默认字库进行训练,利用jTessBoxEditor进行矫正后生成新字库\ 66 | >>![sigleChar](0-9A-Z.png) 67 | 68 | >**第二次训练:扩充字库** 69 | >>python脚本获取识别错误的验证码(脚本最后会给出传送门)\ 70 | >>利用jTessBoxEditor将错误识别的验证码集成tif文件与第一次字库的训练样本(tif文件)合并\ 71 | >>![combine](combine.png)\ 72 | >>利用第一次训练的字库将tif文件进行测试,生成box文件,进行人工纠错\ 73 | >>**操作** 74 | >>>将文件路径改为tif文件所在目录\ 75 | >>>tif文件命名规则 [语言名].[font].[exp0].tif\ 76 | >>>gu是第一轮训练的字库名 77 | >>```commandline 78 | >>cd C:\Users\crayon\OneDrive\Pycode\Crawler\crawler_Basic\exe_file\11\gushiwen_code 79 | >> 80 | >>tesseract gu.font.exp0.tif gu.font.exp0 -l gu --psm 7 batch.nochop makebox 81 | >>``` 82 | >>更新字库 83 | >>```commandlinecd 84 | >>C:\Users\crayon\OneDrive\Pycode\Crawler\crawler_Basic\exe_file\11\gushiwen_code 85 | >>echo font 0 0 0 0 0>font_properties 86 | >> 87 | >>echo Run Tesseract for Training.. 88 | >>tesseract.exe --psm 10 gu.font.exp0.tif gu.font.exp0 nobatch box.train 89 | >> 90 | >>echo Compute the Character Set.. 91 | >>unicharset_extractor.exe gu.font.exp0.box 92 | >>mftraining -F font_properties -U unicharset -O gu.unicharset gu.font.exp0.tr 93 | >> 94 | >>echo Clustering.. 95 | >>cntraining.exe gu.font.exp0.tr 96 | >> 97 | >>echo Rename Files.. 98 | >>rename normproto gu.normproto 99 | >>rename inttemp gu.inttemp 100 | >>rename pffmtable gu.pffmtable 101 | >>rename shapetable gu.shapetable 102 | >> 103 | >>echo Create Tessdata.. 104 | >>combine_tessdata.exe gu. 105 | >>``` 106 | 107 | >第n次训练 108 | >>同第二次训练步骤\ 109 | >>遇到的问题:Empty Page 一般是没有指定训练模式造成的,通过设置psm可以解决 110 | 111 | ## references 112 | >[python+tesseract 训练和破解验证码](https://zhuanlan.zhihu.com/p/40178190) 113 | >>非常详细地介绍了训练字库的步骤,比较有特色的是该作者利用字符色彩不同进行单字符提取,但是这仅对单个字符同色位有效\ 114 | >>对于本文的验证码,这种方法就无效了,还是需要利用截图工具 115 | 116 | >[Tesseract-OCR样本训练方法](https://blog.csdn.net/sylsjane/article/details/83751297) 117 | >>介绍了指令文件运行,写成bat文件后就不用重复劳动 118 | 119 | >[字库合并](https://www.imooc.com/article/32331) 120 | >>本文提出的方法有个缺陷,每次都需要对图片集人工矫正一遍,但实际前一次训练图片集一般是不需要校对的\ 121 | >>这篇文章介绍了box文件合并的技巧,每次只需对新加入的样本进行矫正,从而可以减少训练字库的工作量 122 | 123 | ## Resource 124 | >[python验证码测试脚本](https://github.com/ZhuoZhuoCrayon/pythonCrawler/blob/master/11.2jTessBoxEditor-tesseract.py) 125 | >>[功能1]图片预处理,突出文本\ 126 | >>[功能2]验证码测试,利用训练的字库测试验证码,将无法正确识别的验证码保存 127 | 128 | >[训练字库](https://github.com/ZhuoZhuoCrayon/pythonCrawler/blob/master/exe_file/11/gushiwen_code/gu.traineddata) 129 | >>该字库在第三轮训练中产生,可以达到90%以上的准确率 130 | 131 | >训练命令行指令文件 132 | >>[生成box文件](https://github.com/ZhuoZhuoCrayon/pythonCrawler/blob/master/exe_file/11/gushiwen_code/train_toBox.bat)\ 133 | >>[生成字库](https://github.com/ZhuoZhuoCrayon/pythonCrawler/blob/master/exe_file/11/gushiwen_code/gu.bat) 134 | 135 | >[训练文件集合](https://github.com/ZhuoZhuoCrayon/pythonCrawler/tree/master/exe_file/11/gushiwen_code) 136 | >>保存了每次训练的图片集合、tif、box、训练字库等文件 137 | -------------------------------------------------------------------------------- /tesseract训练模型/combine.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/tesseract训练模型/combine.png -------------------------------------------------------------------------------- /tesseract训练模型/oriCode.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/tesseract训练模型/oriCode.png -------------------------------------------------------------------------------- /zhilianCrawler.py: -------------------------------------------------------------------------------- 1 | import urllib.parse 2 | import urllib.request 3 | import re 4 | import csv 5 | import time 6 | import json 7 | import gzip 8 | from io import StringIO 9 | from bs4 import BeautifulSoup 10 | 11 | class crawler: 12 | date = '20190320' 13 | 14 | # 城市编码 15 | cityIds = { 16 | '北京': '530', 17 | '上海': '538', 18 | '广州': '763', 19 | '深圳': '765', 20 | '杭州': '653', 21 | '天津': '531', 22 | '武汉': '736', 23 | '重庆': '551', 24 | '苏州': '639', 25 | '南京': '635', 26 | '长沙': '749', 27 | } 28 | UA = [ 29 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50", 30 | "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50", 31 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1", 32 | "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1" 33 | ] 34 | # 随机构造头部信息 35 | headers = { 36 | "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36', 37 | 'Host': 'fe-api.zhaopin.com', 38 | 'Upgrade-Insecure-Requests': '1', 39 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', 40 | 'Accept-Language': 'zh-CN,zh;q =0.9', 41 | 'Cache-Control': 'max-age=0', 42 | 'Connection': 'keep-alive' 43 | } 44 | 45 | zhilian_url = 'https://fe-api.zhaopin.com/c/i/sou?' 
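    # Search API endpoint: it returns JSON and is paginated through the start/pageSize
    # parameters assembled in handle_request(). Responses may arrive gzip-compressed;
    # note that GzipFile needs a bytes buffer (io.BytesIO) in Python 3, while
    # getPosition() and run() below wrap content.read() in StringIO.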
46 | start_page = 1 47 | end_page = 100 48 | pos_infor = { 49 | 'local':'NULL', 50 | 'name' :'NULL', 51 | 'size':'NULL', 52 | 'type' :'NULL', 53 | 'position' :'NULL', 54 | 'education':'NULL', 55 | 'experience' :'NULL', 56 | 'need' :'NULL', 57 | 'salary' :'NULL', 58 | 'welfare' :'NULL', 59 | 'num' :'NULL', 60 | 'workplace' :'NULL', 61 | 'companyPage' :'NULL', 62 | } 63 | 64 | def __init__(self,city,keyword): 65 | self.kw = keyword 66 | self.cityId = self.cityIds[city] 67 | csv_url = 'csvfile//智联招聘-'+city+'-'+keyword+'-'+self.date+'.csv' 68 | self.fp = open(csv_url,'wt',newline='',encoding='utf-8-sig') 69 | self.writer = csv.writer(self.fp) 70 | self.writer.writerow(('地区', '企业名称', '企业规模', '企业类别', '招聘岗位', '学历', '工作经验', '工作职责/要求', '薪酬', '福利', '招聘人数', '工作地点', '公司主页')) 71 | 72 | def textDecoration(self,text): 73 | delspace = re.compile(r'\s+') 74 | text = delspace.sub('',text) 75 | set_newline = re.compile(r'[;|:|。|!]') 76 | text = set_newline.sub('\n',text) 77 | text = text.replace("展开",'').strip() 78 | return text 79 | 80 | def getPosition(self,url): 81 | request = urllib.request.Request(url=url,headers=self.headers) 82 | content = urllib.request.urlopen(request) 83 | if content.info().get('Content-Encoding') == 'gzip': 84 | buf = StringIO(content.read()) 85 | f = gzip.GzipFile(fileobj=buf) 86 | content = f.read() 87 | else: 88 | content = content.read() 89 | soup = BeautifulSoup(content,'lxml') 90 | main = soup.find('div',class_ = 'main') 91 | 92 | 93 | lis = main.find('div',class_ = 'main1 cl main1-stat').find_all('li') 94 | 95 | self.pos_infor['salary'] = lis[0].strong.text; 96 | self.pos_infor['position'] = lis[0].h1.text; 97 | 98 | self.pos_infor['name'] = lis[1].find('div',class_ = 'company l').a.text; 99 | _companyPage = lis[1].find('div',class_ = 'company l').a.attrs['href'] 100 | spans = lis[1].find('div',class_ = 'info-three l').find_all('span') 101 | 102 | self.pos_infor['local'] = spans[0].a.text 103 | self.pos_infor['experience'] = spans[1].text 104 | self.pos_infor['education'] = spans[2].text 105 | self.pos_infor['num'] = spans[3].text[1:-1] 106 | 107 | # 福利信息异步加载,改为在json中提取 108 | """ 109 | # pos_info_in = main.find('div',class_ = 'l pos-info-in') 110 | # print(pos_info_in) 111 | welfareSpans = main.find('div',class_ = 'l pos-info-in').find_all('div',class_ = 'pos-info-tit') 112 | print(welfareSpans) 113 | # self.welfare = '' 114 | for span in welfareSpans: 115 | print(span.text) 116 | # self.welfare += str(span.text())+'\n' 117 | """ 118 | 119 | companyAttrs = main.find('ul',class_ = 'promulgator-ul cl') 120 | lis1 = companyAttrs.find_all('li') 121 | 122 | self.pos_infor['type'] = lis1[1].strong.text 123 | self.pos_infor['size'] = lis1[2].strong.text 124 | self.pos_infor['companyPage'] = lis1[3].strong.a['href'] 125 | if self.pos_infor['companyPage'] == '' or self.pos_infor['companyPage'] =='NULL': 126 | self.pos_infor['companyPage'] = _companyPage 127 | 128 | self.pos_infor['workplace'] = lis1[4].strong.text 129 | self.pos_infor['need'] = self.textDecoration(main.find('div',class_ = 'responsibility pos-common').get_text()) 130 | 131 | self.writer.writerow((self.pos_infor['local'],self.pos_infor['name'],self.pos_infor['size'],self.pos_infor['type'],self.pos_infor['position'] , 132 | self.pos_infor['education'],self.pos_infor['experience'],self.pos_infor['need'] ,self.pos_infor['salary'], 133 | self.pos_infor['welfare'],self.pos_infor['num'],self.pos_infor['workplace'],self.pos_infor['companyPage'])) 134 | """ 135 | 
print(self.pos_infor['local'],self.pos_infor['name'],self.pos_infor['size'],self.pos_infor['type'],self.pos_infor['position'] , 136 | self.pos_infor['education'],self.pos_infor['experience'],self.pos_infor['need'] ,self.pos_infor['salary'], 137 | self.pos_infor['welfare'],self.pos_infor['num'],self.pos_infor['workplace'],self.pos_infor['companyPage']) 138 | """ 139 | 140 | def handle_request(self,page): 141 | data = { 142 | 'start':90*(page-1), 143 | 'pageSize':'90', 144 | 'cityId':self.cityId, 145 | 'workExperience':'-1', 146 | 'education':'-1', 147 | 'companyType':'-1', 148 | 'employmentType':'-1', 149 | 'jobWelfareTag':'-1', 150 | 'kw': self.kw, 151 | 'kt':'3', 152 | '_v':'0.70987222', 153 | 'x-zp-page-request-id':'5c93296b093c49febba0d63d812d38d6-1553071553649-676137', 154 | } 155 | 156 | url = self.zhilian_url + urllib.parse.urlencode(data) 157 | print(url) 158 | request = urllib.request.Request(url = url, headers = self.headers) 159 | return request 160 | 161 | # requests.get(url, headers=headers) 162 | 163 | def parse_content(self, content): 164 | selector = json.loads(content) 165 | # print(selector) 166 | data = selector['data']['results'] 167 | 168 | if len(data) == 0: 169 | return 'crawler all' 170 | 171 | for position in data: 172 | # print(position['positionURL']) 173 | self.pos_infor['welfare'] ='' 174 | for _welfare in position['welfare']: 175 | self.pos_infor['welfare'] += _welfare + '\n' 176 | try: 177 | self.getPosition(position['positionURL']) 178 | except: 179 | pass 180 | time.sleep(0.1) 181 | 182 | 183 | return 'next page' 184 | 185 | def run(self): 186 | for page in range(self.start_page,self.end_page+1): 187 | 188 | request = self.handle_request(page) 189 | content = urllib.request.urlopen(request) 190 | 191 | """ 192 | html = content.read() 193 | print(html) 194 | buff = BytesIO(html) 195 | f = gzip.GzipFile(fileobj=buff) 196 | content = f.read().decode() 197 | print(content) 198 | """ 199 | if content.info().get('Content-Encoding')=='gzip': 200 | buf = StringIO(content.read()) 201 | f = gzip.GzipFile(fileobj=buf) 202 | content = f.read() 203 | else: 204 | content = content.read() 205 | 206 | status = self.parse_content(content) 207 | if status == 'crawler all': 208 | print('crawler all') 209 | break 210 | else: 211 | print('crawler end in Page.'+str(page)) 212 | time.sleep(0.5) 213 | 214 | import threading 215 | class crawlerThread(threading.Thread): 216 | 217 | def __init__(self,name,city,keyword): 218 | threading.Thread.__init__(self) 219 | self.name = name 220 | self.city = city 221 | self.keyword = keyword 222 | def run(self): 223 | print(self.name) 224 | test = crawler(self.city,self.keyword) 225 | test.run() 226 | print(self.name+"-----------------get all now!") 227 | 228 | 229 | 230 | if __name__ == '__main__': 231 | cities = [ 232 | '北京', 233 | '上海', 234 | '广州', 235 | '深圳', 236 | '杭州', 237 | '天津', 238 | '武汉', 239 | '重庆', 240 | '苏州', 241 | '南京', 242 | '长沙',] 243 | positions_IT = ['Java开发', 244 | 'UI设计师', 245 | 'Web前端', 246 | 'PHP', 247 | 'Python', 248 | 'Android', 249 | '深度学习', 250 | '算法工程师', 251 | 'hadoop', 252 | 'Node.js', 253 | '数据开发', 254 | '数据分析师', 255 | '数据架构', 256 | '人工智能' 257 | '区块链' 258 | ] 259 | positions_Finance = [ 260 | '投资经理', 261 | '风控', 262 | '催收', 263 | '银行柜员', 264 | '银行销售', 265 | '信审', 266 | '信用卡', 267 | '贷款', 268 | '金融产品', 269 | '汽车金融', 270 | '金融研究', 271 | '证券交易员', 272 | '投资经理', 273 | '期货', 274 | '操盘手', 275 | '基金', 276 | '股票', 277 | '投资顾问', 278 | '信托', 279 | '典当', 280 | '担保', 281 | '信贷', 282 | '权证', 283 | '保险', 284 | '理赔', 285 | '精算师', 
286 | '理财', 287 | '顾问', 288 | '查勘定损', 289 | '车险' 290 | ] 291 | 292 | 293 | for city in cities: 294 | threads = [] 295 | for position in positions_Finance: 296 | thread = crawlerThread(city+'-'+position,city,position) 297 | # thread.start() 298 | # thread.join() 299 | threads.append(thread) 300 | for thread in threads: 301 | thread.start() 302 | for thread in threads: 303 | thread.join() 304 | 305 | --------------------------------------------------------------------------------